# -*- coding: utf-8 -*-

import os
import logging
import tarfile
import traceback
import datetime
import time
import itertools
import copy
import shutil
import requests
import httplib
import urllib2
import socket
import gzip
import cStringIO
import re

from sandbox.common import errors
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.sandboxsdk.paths import make_folder
from sandbox.sandboxsdk.channel import channel
from sandbox.sandboxsdk import parameters

from sandbox.projects import resource_types

from sandbox.projects.common import decorators
from sandbox.projects.common import error_handlers as eh
from sandbox.projects.common import string
from sandbox.projects.common import utils
from sandbox.projects.common.pumpkin import utils as pumpkin_utils
from sandbox.projects.common.pumpkin import parameters as pumpkin_params


# Cookies sent with every SERP request issued by get_and_proc_serp().
# NOTE(review): this looks like a credential committed to the repository --
# confirm whether it should live in a secret storage instead.
_INNER_ROBOT_COOKIES = {'i-m-not-a-hacker': 'ZJYnDvaNXYOmMgNiScSyQSGUDffwfSET'}
# Context key under which the parent task stores the ids of its subtasks.
_SUBTASK_IDS_KEY = 'subtask_ids'
# Context key holding the volume number; its presence marks a child task.
_VOLUME_ID_KEY = 'volume_id'

# NOTE(review): _ARCHIVE_DIR is not referenced in the visible part of this module.
_ARCHIVE_DIR = 'archive'
# Working directory in which a child task lays out downloaded pages before packing.
_SUBARCHIVE_DIR = 'subarchive'
# File in which the parent accumulates the indexes of all children.
_INDEX_FILE = 'index'
# Member names used for standalone pages inside the final archive.
_MAIN_PAGE_FILE = 'main.html'
_NOTFOUND_PAGE_FILE = 'notfound.html'
_NOTFOUND_PAGE_FILE_WITHOUT_HEADER = 'notfound_without_header.html'
# Per-volume index written by a child task into _SUBARCHIVE_DIR.
_SUBINDEX_FILE = 'subindex'

# Upper bound accepted for the ConnsLimit parameter (see SerpCollector.conns_check).
_CONNS_THRESHOLD = 500

# retry_on_bad_thumbs() performs one attempt plus this many retries.
_BAD_THUMBS_RETRIES_NUMBER = 2
# A response matching this pattern is treated as having bad thumbs.
_BAD_THUMBS_RE = re.compile(r'serp-item__thumb" onerror')
# Substring of a URL that enables header cutting in get_and_proc_serp().
_NO_HEADER_PARAMS = "no_header=1"

# Number of attempts made to download the main / "not found" pages.
_STANDALONE_PAGE_RETRIES_NUMBER = 3


def _make_tar_member(filename, content):
    member = tarfile.TarInfo(name=filename)
    member.size = len(content)
    return member, cStringIO.StringIO(content)


class PoolImapFunction(object):
    """
    Picklable adapter for calling a multi-argument function through
    multiprocessing.pool.imap*.

    imap* passes every work item to the callable as one object; an instance
    of this class receives that item and unpacks it into positional
    arguments of the wrapped function.

    THIS IS NOT A REGULAR DECORATOR. Do not write

        @PoolImapFunction
        def g(x):
            print 'g called'

    because that is the same as

        def g(x):
            print 'g called'
        g = PoolImapFunction(g)

    and pickle will raise an exception for such an object.

    Keep the wrapped function and the wrapper under different names instead:

        def g(x):
            print 'g called'

        h = PoolImapFunction(g)

    """

    def __init__(self, func):
        # The wrapped callable; invoked with the unpacked work item.
        self.func = func

    def __call__(self, args):
        target = self.func
        return target(*args)


@decorators.retries(
    max_tries=3,
    delay=0.2,
    backoff=1,
    exceptions=(
        httplib.HTTPException,
        urllib2.URLError,
        socket.error,
        # requests reports connection/timeout/transfer failures through its
        # own hierarchy, which is not derived from any of the classes above;
        # without this entry such failures were never retried.
        requests.exceptions.RequestException,
    )
)
def failsafe_request_get(url, **kwargs):
    """
    Perform an HTTP GET and return the raw response body.

    The call is wrapped in ``decorators.retries`` configured with the
    exception classes listed above.

    :param url: URL to fetch
    :param kwargs: passed through to ``requests.get``
    :return: response body (``Response.content``); the HTTP status code is
             not inspected, so error pages are returned like any other body
    """
    return requests.get(url, **kwargs).content


def retry_on_bad_thumbs(request):
    """
    Wrap ``request`` so that a response matching _BAD_THUMBS_RE is re-requested.

    The wrapped callable is invoked with the same arguments up to
    _BAD_THUMBS_RETRIES_NUMBER + 1 times; the first response that does not
    match the pattern is returned.

    :param request: callable returning the response body
    :return: wrapper with the same call signature as ``request``
    :raises Exception: (from the wrapper) when every attempt matched the pattern
    """
    def wrapper(*args, **kwargs):
        attempts_left = _BAD_THUMBS_RETRIES_NUMBER + 1
        while attempts_left > 0:
            attempts_left -= 1
            response = request(*args, **kwargs)
            if _BAD_THUMBS_RE.search(response) is None:
                return response
        raise Exception("Bad thumbs detected during all {} retries".format(_BAD_THUMBS_RETRIES_NUMBER))
    return wrapper


def identity(x):
    """Return ``x`` unchanged; used as a no-op stand-in for optional processing steps."""
    return x


def get_and_proc_serp(url, extra_params=None, add_params='', disable_cleanup=False, check_thumbs=False, header_delimeter=''):
    """
    Fetch one SERP, post-process it and gzip the result in memory.

    Index URL, fetch URL and hash name are derived from ``url`` by
    ``pumpkin_utils.make_urls``. Nothing is written to disk here; the caller
    gets both the processed page and its gzipped form.

    The function never raises: any failure is reported through the last
    element of the returned tuple, which makes it usable as a Pool work item.

    :param url: source URL of the SERP
    :param extra_params: names of CGI parameters used for the index URL in
        addition to 'text' and 'no_header'; None or empty means no extras
    :param add_params: forwarded to ``make_urls`` as ``add_params``
    :param disable_cleanup: if true, the page is kept as fetched instead of
        being passed through ``cleanup_serp.Proc().run``
    :param check_thumbs: if true, fetching is wrapped in retry_on_bad_thumbs
    :param header_delimeter: if non-empty and ``url`` contains "no_header=1",
        the page is split on this delimiter and only the fragment following
        its first occurrence is kept
    :return: tuple (index_url, hashname, content, zipped_content, exception);
        ``exception`` is None on success and a formatted traceback otherwise,
        in which case values not computed before the failure are None
    """

    # None replaces the former shared mutable ``[]`` default; both mean "no extras".
    extra_params = list(extra_params) if extra_params else []

    logging.debug('get_and_proc_serp({0}, {1}, {2}, {3}, {4}, {5})'.format(
        url,
        extra_params,
        add_params,
        disable_cleanup,
        check_thumbs,
        header_delimeter,
    ))

    def cut_header(serp):
        try:
            return bytes(str(serp).split(header_delimeter)[1])
        except IndexError:
            # serp does not contain this delimeter
            eh.fail("Serp does not contain header delimeter '{}', which must be present".format(header_delimeter))

    index_url = hashname = content = zipped_content = exception = None

    try:

        # FIXME redefinition of unused 'Proc'
        from cleanup_serp import Proc

        _requests = failsafe_request_get
        _serp_proc = identity if disable_cleanup else Proc().run
        _intelligent_retry = retry_on_bad_thumbs if check_thumbs else identity
        # check that delimeter is set and that query contains cgi parameter for no header serp
        _cut_header = cut_header if header_delimeter and _NO_HEADER_PARAMS in url else identity

        default_parameters = ['text', 'no_header']
        index_parameters = default_parameters + extra_params
        exclude_from_fetch_params = ['no_header']
        index_url, fetch_url, hashname = pumpkin_utils.make_urls(
            url,
            index_params=index_parameters,
            add_params=add_params,
            exclude_from_fetch_params=exclude_from_fetch_params,
        )

        content = _serp_proc(_cut_header(_intelligent_retry(_requests)(fetch_url, cookies=_INNER_ROBOT_COOKIES)))

        fileobj = cStringIO.StringIO()
        # The context manager finalizes the gzip stream even if write() fails.
        with gzip.GzipFile(filename=hashname + '.html', fileobj=fileobj, mode='wb') as zipped_file:
            zipped_file.write(content)
        zipped_content = fileobj.getvalue()
    except:  # noqa: E722
        # Deliberately catches everything: the error must travel back to the
        # caller as data, otherwise a Pool worker could die and leave
        # imap_unordered waiting for a result that never arrives.
        exception = traceback.format_exc()

    return index_url, hashname, content, zipped_content, exception


# Adapter handed to Pool.imap_unordered in SerpCollector.do_child: each work
# item is a tuple of positional arguments for get_and_proc_serp.
imap_get_and_proc_serp = PoolImapFunction(get_and_proc_serp)


def safe_sync_resource(channel, resource, retries=None, timeout=300):
    """
    Sync a resource, sleeping and retrying while ``errors.TaskError`` is raised.

    :param channel: object whose ``task.sync_resource`` performs the sync
    :param resource: value passed to ``sync_resource``
    :param retries: number of attempts whose TaskError is swallowed; None
        means retry without limit, 0 means go straight to the final attempt
    :param timeout: seconds to sleep after a failed attempt
    :return: whatever ``sync_resource`` returns
    :raises errors.TaskError: from the final, unguarded attempt made once
        ``retries`` guarded attempts have failed
    """
    # Only None selects the endless mode. Testing truthiness made an explicit
    # ``retries=0`` silently mean "retry forever".
    if retries is None:
        guarded_attempts = itertools.repeat(None)
    else:
        guarded_attempts = itertools.repeat(None, retries)

    for _ in guarded_attempts:
        try:
            return channel.task.sync_resource(resource)
        except errors.TaskError:
            time.sleep(timeout)
    # Last attempt: let the error propagate to the caller.
    return channel.task.sync_resource(resource)


class UrlsResource(parameters.LastReleasedResource):
    """Input resource with the URLs to fetch; the task opens it as a tar archive with one member per volume."""
    name = 'UrlsResource'
    description = 'Set of urls to retrieve'
    resource_type = 'SERP_COLLECTOR_URLS'


class MainPageUrl(parameters.SandboxStringParameter):
    """Optional URL of the main page; when set, the page is added to the final archive as main.html."""
    name = 'MainPageUrl'
    # Typo fix in the user-visible text: "retrive" -> "retrieve".
    description = 'Main page url to retrieve'
    required = False


class NotFoundPageUrl(parameters.SandboxStringParameter):
    """Optional URL of the "not found" page; when set, the page is added to the final archive as notfound.html."""
    name = 'NotFoundPageUrl'
    # Typo fix in the user-visible text: "retrive" -> "retrieve".
    description = '"Not found" page url to retrieve'
    required = False


class NotFoundPageUrlWithoutHeader(parameters.SandboxStringParameter):
    """Optional URL of the header-less "not found" page; stored as notfound_without_header.html."""
    name = 'NotFoundPageUrlWithoutHeader'
    # Typo fix in the user-visible text: "retrive" -> "retrieve".
    description = '"Not found" page url to retrieve without header'
    required = False


class SeriesName(parameters.SandboxStringParameter):
    """Series name used in resource names and attributes; if empty, the 'series' attribute of the URLs resource is used."""
    name = 'SeriesName'
    description = 'Name of the series'
    default_value = 'generalized'
    required = True


class ExtraCGIParams(parameters.SandboxStringParameter):
    """Comma-separated CGI parameter names; the child task splits the value on ',' and passes the names to get_and_proc_serp."""
    name = 'ExtraCGIParams'
    description = 'CGI parameters to be kept in requests (besides text and lr). Example: nocache,i-m-a-hacker,sbh.'


class ResourceAttrsParam(parameters.SandboxStringParameter):
    """Extra attributes for the final resource; the parent task parses the value with string.parse_attrs."""
    name = 'ResourceAttrs'
    description = 'Set attrs to loot resource (e.g.: attr1=v1, attr2=v2)'
    group = 'Options'
    default_value = None


class ConnsLimit(parameters.SandboxIntegerParameter):
    """
    Total number of simultaneous downloads.

    The parent task checks that the value is positive and does not exceed
    _CONNS_THRESHOLD, then divides it between its children; every child uses
    its share as the size of the multiprocessing pool.
    """
    name = 'ConnsLimit'
    description = 'Limit of simultaneous downloads'
    default_value = 100


class DisableSerpCleanup(parameters.SandboxBoolParameter):
    """When true, fetched pages are stored as is instead of being passed through cleanup_serp.Proc."""
    name = 'DisableSerpCleanup'
    description = 'Disable SERP cleanup (SERP-25451)'
    default_value = True


class SerpCollector(SandboxTask):
    """
    Collect SERPs for a set of URLs and pack them into one archive resource.

    The same task type plays two roles, selected by the presence of
    ``_VOLUME_ID_KEY`` in the task context:

    * parent: creates one child per volume of the URLs resource, waits for
      them, merges their sub-archives and indexes into the final resource and
      optionally adds the main and "not found" pages;
    * child: downloads every URL of its volume in parallel and publishes the
      result as a temporary sub-archive resource.
    """
    type = 'SERP_COLLECTOR'

    execution_space = 25000
    cores = 2
    required_ram = 8072

    input_parameters = (
        UrlsResource,
        MainPageUrl,
        NotFoundPageUrl,
        NotFoundPageUrlWithoutHeader,
        SeriesName,
        ExtraCGIParams,
        pumpkin_params.AddCGIParams,
        ResourceAttrsParam,
        ConnsLimit,
        pumpkin_params.DisregardTime,
        DisableSerpCleanup,
        pumpkin_params.CheckThumbs,
        pumpkin_params.SerpHeaderDelimeterParameter,
    )

    def on_execute(self):
        """Sync the URLs resource and dispatch to the parent or the child routine."""
        self.sync_resource(self.ctx[UrlsResource.name])
        urls_resource = channel.sandbox.get_resource(self.ctx[UrlsResource.name])
        series = self.ctx.get(SeriesName.name) or urls_resource.attributes['series']

        if _VOLUME_ID_KEY not in self.ctx:
            self.do_parent(urls_resource, series)
        else:
            self.do_child(self.ctx[_VOLUME_ID_KEY], urls_resource, series)

    def do_parent(self, urls_resource, series):
        """
        Run the parent role: spawn children, merge their results, publish the archive.

        Subtasks are created only when their ids are not yet stored in the
        context, so the method can be entered again after waiting for them
        (presumably wait_all_tasks_completed suspends the task -- TODO confirm).

        :param urls_resource: resource object of the URLs archive (its ``path`` is opened as a tar)
        :param series: series name used in resource names, descriptions and attributes
        """
        if _SUBTASK_IDS_KEY not in self.ctx:
            # 1. checks
            self.time_check(3, 0, 6)
            self.conns_check()

            # 2. every member of the URLs archive except the root entry is a volume
            with tarfile.open(urls_resource.path, "r") as tar:
                volume_count = len([name for name in tar.getnames() if name != "."])

            # 3. create subtasks
            logging.info("creating subtasks...")
            self.ctx[_SUBTASK_IDS_KEY] = [
                self.run_child(i, volume_count, series) for i in xrange(volume_count)
            ]

        # 4. ensure that all tasks finished
        subtask_ids = self.ctx[_SUBTASK_IDS_KEY]
        subtasks = [channel.sandbox.get_task(subtask_id) for subtask_id in subtask_ids]
        unfinished_tasks = [subtask.id for subtask in subtasks if not subtask.is_done()]
        if unfinished_tasks:
            self.wait_all_tasks_completed(unfinished_tasks)

        utils.check_if_tasks_are_ok(subtask_ids)

        # 5. create resource
        zip_only = self.ctx.get(pumpkin_params.ZipOnly.name)
        logging.info("creating resource archive...")
        cur_time = datetime.datetime.today().strftime('%Y.%m.%d_%H-%M')
        tarname = '{0}_{1}_{2}.{3}'.format('archive', cur_time, series, "tar" if zip_only else "tgz")

        attrs = {}
        if self.ctx.get(ResourceAttrsParam.name):
            attrs = string.parse_attrs(self.ctx[ResourceAttrsParam.name])
        attrs['series_and_volume'] = '{0} {1}'.format(series, -1)
        attrs['backup_task'] = 1

        loot_resource = self.create_resource(
            description='Collected serps: id {0} time {1}'.format(series, cur_time),
            resource_path=tarname,
            resource_type=resource_types.SERP_COLLECTOR_LOOT,
            arch='any',
            attributes=attrs,
        )
        # Name of the per-volume index as it appears inside a sub-archive
        # (children pack _SUBARCHIVE_DIR under the "." prefix).
        subindex_member = "./{0}".format(_SUBINDEX_FILE)
        with tarfile.open(loot_resource.path, "w" if zip_only else "w|gz") as tar:
            # 6. download workers' results & glue up
            with open(_INDEX_FILE, "w") as accum_index:
                for subtask in subtasks:
                    subarchive = safe_sync_resource(channel, subtask.ctx['subarchive'])
                    with tarfile.open(subarchive) as subtar:
                        for member in subtar:
                            if member.name == subindex_member:
                                # indexes are concatenated instead of being copied as members
                                shutil.copyfileobj(subtar.extractfile(member), accum_index)
                            else:
                                tar.addfile(member, subtar.extractfile(member))
            tar.add(_INDEX_FILE)

            # 7. add main page if needed
            main_page_url = self.ctx.get(MainPageUrl.name)
            check_thumbs = utils.get_or_default(self.ctx, pumpkin_params.CheckThumbs)
            if main_page_url:
                self._add_standalone_page(
                    tar, main_page_url, _MAIN_PAGE_FILE, zip_only,
                    "Main page download failed:\n{}",
                    check_thumbs=check_thumbs,
                )

            # 8. add 'not found' page (with header and without) if needed
            header_delimeter = utils.get_or_default(self.ctx, pumpkin_params.SerpHeaderDelimeterParameter)
            notfound_pages = (
                (NotFoundPageUrl.name, _NOTFOUND_PAGE_FILE),
                (NotFoundPageUrlWithoutHeader.name, _NOTFOUND_PAGE_FILE_WITHOUT_HEADER),
            )
            for param_name, file_name in notfound_pages:
                notfound_page_url = self.ctx.get(param_name)
                if notfound_page_url:
                    self._add_standalone_page(
                        tar, notfound_page_url, file_name, zip_only,
                        "404 page download failed:\n{}",
                        header_delimeter=header_delimeter,
                    )

        # 9. remove temporary resources
        for subtask in subtasks:
            resource_id = int(subtask.ctx['subarchive'])
            logging.info('Removing temporary resource {0}'.format(resource_id))
            channel.sandbox.delete_resource(resource_id, ignore_last_usage_time=True)

    def _add_standalone_page(self, tar, url, file_name, zip_only, failure_message, **fetch_kwargs):
        """
        Download a single page with retries and add it to ``tar``.

        :param tar: open TarFile to add members to
        :param url: page URL
        :param file_name: member name of the plain page; the gzipped copy gets a ".gz" suffix
        :param zip_only: if true, only the gzipped copy is added
        :param failure_message: format string (one ``{}`` placeholder for the
            traceback) passed to eh.fail when every attempt has failed
        :param fetch_kwargs: extra keyword arguments for get_and_proc_serp
        """
        for _ in xrange(_STANDALONE_PAGE_RETRIES_NUMBER):
            content, zipped_content, exception = get_and_proc_serp(url, **fetch_kwargs)[2:]
            if not exception:
                break
        else:
            eh.fail(failure_message.format(exception))

        if not zip_only:
            tar.addfile(*_make_tar_member(file_name, content))
        tar.addfile(*_make_tar_member(file_name + ".gz", zipped_content))

    def do_child(self, volume, urls_resource, series):
        """
        Run the child role: download all URLs of one volume and publish a sub-archive.

        Failures of individual URLs are only logged; the task fails when the
        parallel processing as a whole raises.

        :param volume: volume number; also the name of the member to extract from the URLs archive
        :param urls_resource: resource object of the URLs archive
        :param series: series name used in the resource description and attributes
        """
        logging.info("Get Volume Content routine: volume {0}".format(volume))

        from multiprocessing import Pool

        # unpack urls_resource
        with tarfile.open(urls_resource.path) as tar:
            try:
                tar.extract(str(volume))
            except KeyError:
                # the member may be stored with a "./" prefix
                tar.extract("./{0}".format(volume))

        # process
        zip_only = self.ctx.get(pumpkin_params.ZipOnly.name)
        make_folder(_SUBARCHIVE_DIR, delete_content=True)

        # An unset or empty parameter yields no extra names (previously it
        # produced [''] or failed on None).
        raw_extra_params = self.ctx.get(ExtraCGIParams.name) or ''
        extra_params = [name for name in (s.strip() for s in raw_extra_params.split(',')) if name]
        add_params = self.ctx[pumpkin_params.AddCGIParams.name]
        disable_cleanup = self.ctx[DisableSerpCleanup.name]
        check_thumbs = utils.get_or_default(self.ctx, pumpkin_params.CheckThumbs)
        header_delimeter = utils.get_or_default(self.ctx, pumpkin_params.SerpHeaderDelimeterParameter)

        pool_failed = False
        subindex_path = os.path.join(_SUBARCHIVE_DIR, _SUBINDEX_FILE)
        # Both files are closed by the context manager, including the URL list
        # which used to stay open.
        with open(str(volume)) as urls, open(subindex_path, 'w') as subindex_file:
            pool = Pool(self.ctx[ConnsLimit.name])
            try:
                stripped_urls = (url.strip('\n') for url in urls)
                # Skip blank lines. The filter used to test the file object
                # itself, which is always true.
                ok_urls = (url for url in stripped_urls if url)

                for url, hashname, content, zipped_content, exception in pool.imap_unordered(
                        imap_get_and_proc_serp,
                        ((url, extra_params, add_params, disable_cleanup, check_thumbs, header_delimeter) for url in ok_urls)):
                    if exception:
                        logging.warning('{0}\n{1}'.format(url, exception))
                    else:
                        # pages are spread over directories named by the first three characters of the hash
                        inner_dirpath = os.path.join(hashname[0], hashname[1], hashname[2])
                        inner_filepath = os.path.join(inner_dirpath, hashname + '.html')
                        dirpath = os.path.join(_SUBARCHIVE_DIR, inner_dirpath)
                        filepath = os.path.join(dirpath, hashname + '.html')
                        if not os.path.exists(dirpath):
                            os.makedirs(dirpath)
                        if not zip_only:
                            with open(filepath, 'w') as f:
                                f.write(content)
                        with open(filepath + '.gz', 'w') as f:
                            f.write(zipped_content)
                        subindex_file.write('{0}\t{1}\n'.format(url, inner_filepath))
            except:  # noqa: E722
                # Any error is turned into a regular task failure below, after
                # the pool has been shut down.
                logging.exception("Exception during parallel download/processing")
                pool_failed = True
            finally:
                pool.close()
                pool.join()
        eh.ensure(not pool_failed, "Parallel download/processing failed")

        # pack subarchive
        cur_time = datetime.datetime.today().strftime('%Y.%m.%d_%H-%M')
        tarname = '{0}_{1}_{2}.{3}'.format('subarchive', cur_time, volume, "tar" if zip_only else "tgz")
        with tarfile.open(tarname, "w" if zip_only else "w|gz") as tar:
            tar.add(_SUBARCHIVE_DIR, ".")

        logging.info("about to create volume resorce")
        subarch_res = self.create_resource(
            description='Partially collected serps: id {0} volume {1} time {2}'.format(series,
                                                                                       volume + 1,
                                                                                       cur_time),
            resource_path=tarname,
            resource_type=resource_types.SERP_COLLECTOR_TEMP_LOOT,
            arch='any',
            attributes={
                'series_and_volume': '{0} {1}'.format(series, volume),
            }
        )
        logging.info("volume resource created")
        # the parent reads this key to find the sub-archive
        self.ctx['subarchive'] = subarch_res.id

        logging.info("finishing get_volume_content")

    def run_child(self, volume, volume_count, series):
        """
        Create the subtask that processes one volume.

        The child receives a deep copy of this task's context with its own
        volume number and its share of the connection limit.

        :param volume: zero-based volume number
        :param volume_count: total number of volumes
        :param series: series name (used in the description only)
        :return: id of the created subtask
        """
        sub_ctx = copy.deepcopy(self.ctx)
        # Floor division gives 0 when there are more volumes than allowed
        # connections, and Pool(0) raises ValueError in the child. Every child
        # gets at least one connection, so in that case the total may exceed
        # the requested limit.
        sub_ctx[ConnsLimit.name] = max(1, self.ctx[ConnsLimit.name] // volume_count)
        sub_ctx[_VOLUME_ID_KEY] = volume

        sub_task = self.create_subtask(
            task_type='SERP_COLLECTOR',
            input_parameters=sub_ctx,
            description="subtask for \"{0}\" series SerpCollector, vol #{1} of {2}".format(
                series, volume, volume_count))
        return sub_task.id

    def time_check(self, _tz, _from, _to):
        """
        Fail unless the current hour lies strictly between ``_from`` and ``_to``.

        The check is skipped when the DisregardTime parameter is set.

        :param _tz: hour offset added to the current UTC hour
        :param _from: exclusive lower bound of the allowed hour
        :param _to: exclusive upper bound of the allowed hour
        """
        if not self.ctx[pumpkin_params.DisregardTime.name]:
            nowhour = datetime.datetime.utcnow().hour
            # NOTE(review): both bounds are exclusive, so the hour equal to
            # _from (e.g. 00:xx for the 0..6 window) is rejected -- confirm
            # that this is intended.
            eh.ensure(
                _from < (nowhour + _tz) % 24 < _to,
                "It is not recommended to run task in daytime. "
                "Use 'Run task regardless of time' option to pass the restriction."
            )

    def conns_check(self):
        """Fail unless ConnsLimit is positive and does not exceed _CONNS_THRESHOLD."""
        conns_limit = self.ctx[ConnsLimit.name]
        eh.ensure(conns_limit <= _CONNS_THRESHOLD, "Potentially disastrous simultaneous connections number.")
        eh.ensure(conns_limit > 0, "{0} connections is somehow confusing...".format(conns_limit))


# Module-level alias of the task class; presumably the name the Sandbox task
# loader looks up -- TODO confirm.
__Task__ = SerpCollector
