import os
import re
from datetime import timedelta
import logging

from sandbox.sandboxsdk.errors import SandboxTaskFailureError
from sandbox.sandboxsdk.paths import make_folder
import sandbox.sandboxsdk.parameters as sdk_parameters

from sandbox.projects import resource_types
from sandbox.projects.common.userdata import userdata_base_task, util


class UserBrowsePackage(sdk_parameters.LastReleasedResource):
    """Task parameter that selects the resource holding the user_browse package."""

    # Shown together with the other package parameters of the userdata tasks.
    group = userdata_base_task.PACKAGES_GROUP_NAME
    resource_type = resource_types.USERFEAT_USER_BROWSE_PACKAGE
    description = 'Resource with user_browse package'
    name = 'user_browse_package_resid'


class IpregResource(sdk_parameters.LastReleasedResource):
    """Optional task parameter that selects the IPREG_SET resource."""

    # Shown together with the other data-file parameters of the userdata tasks.
    group = userdata_base_task.DATA_FILES_GROUP_NAME
    resource_type = resource_types.IPREG_SET
    required = False  # don't require it yet
    description = 'Resource with IPREG and co'
    name = 'ipreg_resource_id'


class ClickDaemonKeysResource(sdk_parameters.LastReleasedResource):
    """Optional task parameter that selects the CLICKDAEMON_KEYS resource."""

    # Shown together with the other data-file parameters of the userdata tasks.
    group = userdata_base_task.DATA_FILES_GROUP_NAME
    resource_type = resource_types.CLICKDAEMON_KEYS
    required = False
    description = 'Resource with CLICKDAEMON_KEYS'
    name = 'clickdaemon_keys_resource_id'


class UserBrowseUpdateDataRun(userdata_base_task.Task):
    """
        Runs user_browse/scripts/update.py on a stored sample of user_sessions
    """

    # Flow implemented by this class:
    #   prepare_mr()       validates the number of periods and builds the two
    #                      run descriptions ("--comp" and "--preserve_prefix")
    #                      stored in self.periods;
    #   process_mr_data()  patches the local user_browse config and runs
    #                      update.py once per entry of self.periods.

    type = 'USER_BROWSE_UPDATE_DATA'

    execution_space = 90000
    node_chunk_store_quota = 20 << 30
    forbid_chunk_storage_in_tmpfs = True
    yt_testable = True

    input_parameters = util.smart_join_params(
        userdata_base_task.Task.input_parameters,
        userdata_base_task.RflForCanonizerParameter,
        UserBrowsePackage,
        IpregResource,
        ClickDaemonKeysResource,
    )

    @util.lazy_property
    def dates_context(self):
        """Return the date/period settings derived from the state attributes.

        The result is a dict with the keys ``days_per_period``, ``last_date``,
        ``num_periods``, ``first_date``, ``comm_cache_date`` and
        ``similargroup_data_delay``. ``first_date`` is computed with
        ``util.get_first_date`` from ``last_date`` and the total number of
        days (``days_per_period * num_periods``).
        """
        state = self.get_state_attrs()

        # Parse the numeric attributes once and reuse them below.
        days_per_period = int(state["days_per_period"])
        last_date = state["last_date"]
        num_periods = int(state["num_periods"])

        return {
            "days_per_period": days_per_period,
            "last_date": last_date,
            "num_periods": num_periods,
            "first_date": util.get_first_date(
                last_date,
                days_per_period * num_periods
            ),
            "comm_cache_date": state["comm_cache_date"],
            "similargroup_data_delay": 2,  # TODO: make it a task parameter
        }

    def create_local_config(self):
        """Patch user_browse_config.py and write the ``web_last_date`` files.

        Returns a dict of config settings discovered while parsing; right now
        only ``has_delay_for_data_src`` (bool), which is True when the config
        contains a ``delayForDataSrc`` assignment.

        Side effects visible here: ``self.ctx["data_format_version"]`` is set
        when a ``webQSIndexVersion`` line is found, and a ``web_last_date``
        file is written into the ``data`` and ``base`` sub-folders of
        ``<berkanavt>/user_browse``.
        """
        dates_ctx = self.dates_context

        ret_config_settings = {'has_delay_for_data_src': False}

        # Patterns and replacement fragments do not depend on the processed
        # line, so they are prepared once instead of on every callback call.
        index_version_re = re.compile(r"^webQSIndexVersion\s*=\s*(\d+)")
        data_parts_spec_re = re.compile(r"^(.*self.DailyDataPartsSpec[^,]+)(.+)$")
        delay_for_data_src_re = re.compile(r"^(.*)delayForDataSrc\s=.+$")

        data_parts_spec_tail = (
            ", {days_per_period}, {num_periods} - 1 - {similargroup_data_delay}".format(
                **dates_ctx
            )
        ) + "),"
        delay_for_data_src_stmt = (
            "delayForDataSrc = {{ 'spy_log': 0, 'similargroup': {similargroup_data_delay} }}".format(
                **dates_ctx
            )
        )

        def custom_line_processor(line):
            # Remember the index version; the line itself is left untouched.
            m = index_version_re.search(line)
            if m:
                self.ctx["data_format_version"] = int(m.group(1))
                return line

            # support patching in dailyDataPartsSpecs and (small,normal,full)DailyDataPartsSpec
            # used in dailyFeaturesSpecs
            # Everything after the first argument of the spec is replaced.
            m = data_parts_spec_re.match(line)
            if m:
                return m.group(1) + data_parts_spec_tail

            # Replace the whole right-hand side, keeping the text that
            # precedes the name (group 1).
            m = delay_for_data_src_re.match(line)
            if m:
                ret_config_settings['has_delay_for_data_src'] = True
                return m.group(1) + delay_for_data_src_stmt

            return line

        user_browse_root = os.path.join(self.ctx["berkanavt"], "user_browse")

        self.config_patcher.patch(
            os.path.join(self.ctx["berkanavt"], "user_browse/scripts/user_browse/user_browse_config.py"),
            {
                "projectRoot": user_browse_root,
                "remMaxErrLen": 320000,
                "daysPerDataPart": dates_ctx["days_per_period"]
            },
            custom_line_processor
        )

        # The stored date is one period earlier than the overall last date.
        comp_last_date = util.str2date(dates_ctx["last_date"]) - timedelta(int(dates_ctx["days_per_period"]))
        # NOTE(review): assumes util.date2str returns a str - confirm.
        web_last_date = util.date2str(comp_last_date)

        for subdir in "data", "base":
            target_dir = os.path.join(user_browse_root, subdir)
            make_folder(target_dir)
            with open(os.path.join(target_dir, "web_last_date"), "w") as wld:
                # write() instead of the Python-2-only "print >> file" form;
                # the output (value plus newline) is the same.
                wld.write(web_last_date + "\n")

        return ret_config_settings

    def prepare_mr(self):
        """Build ``self.periods`` and copy the comm cache tables if needed.

        Calls the base-class ``prepare_mr`` first. Two period dicts are then
        derived from ``dates_context``: a "--comp" one whose ``last_date`` is
        recomputed from ``first_date``, and a "--preserve_prefix" one whose
        ``first_date`` is recomputed from ``last_date``.

        Raises:
            SandboxTaskFailureError: when ``num_periods`` is smaller than
                ``1 + 2 + similargroup_data_delay``.
        """
        userdata_base_task.Task.prepare_mr(self)

        periods_ctx = self.dates_context

        num_periods = periods_ctx["num_periods"]
        data_delay = periods_ctx["similargroup_data_delay"]

        # 1 period for update shift
        # 2 periods in minimum for any comp/update
        # + delay
        min_num_periods = 1 + 2 + data_delay

        if num_periods < min_num_periods:
            raise SandboxTaskFailureError(
                "Number of periods should be {} or more".format(min_num_periods)
            )

        # Both runs use the same reduced number of periods and the same span
        # (in days) between their first and last dates.
        run_num_periods = num_periods - 1 - data_delay
        run_span_days = periods_ctx["days_per_period"] * (num_periods - 1)

        comp_period = dict(periods_ctx)
        comp_period["mode_args"] = "--comp"
        comp_period["num_periods"] = run_num_periods
        comp_period["last_date"] = util.get_last_date(
            comp_period["first_date"], run_span_days
        )

        update_period = dict(periods_ctx)
        update_period["mode_args"] = "--preserve_prefix"
        update_period["num_periods"] = run_num_periods
        update_period["first_date"] = util.get_first_date(
            update_period["last_date"], run_span_days
        )

        self.periods = [comp_period, update_period]

        logging.info("Periods: %s", self.periods)

        for p in self.periods:
            # Nothing to copy when the cache is already dated as this run.
            if p["comm_cache_date"] == p["last_date"]:
                continue
            for norm in "dopp", "agg":
                self.mr_client.copy_table(
                    self.get_tables_prefix() + "user_browse/main_comm_q/" + norm + "norm/_" + p["comm_cache_date"],
                    self.get_tables_prefix() + "user_browse/main_comm_q/" + norm + "norm/_" + p["last_date"]
                )

    def process_mr_data(self):
        """Run update.py once for every period prepared by ``prepare_mr``.

        The command line is a template with ``{placeholder}`` fields; it is
        handed to ``util.run_shell_process`` together with a per-period
        context (a copy of ``self.ctx`` updated with the period dict).
        NOTE(review): presumably run_shell_process fills the placeholders
        from the keyword arguments - confirm against its implementation.
        """
        config_settings = self.create_local_config()
        has_delay_for_data_src = config_settings['has_delay_for_data_src']

        paths = self.get_common_pythonpaths()
        for suffix in "", "/common":
            paths.append(os.path.join(self.ctx["berkanavt"], "user_browse/scripts" + suffix))
        # Identical for every period, so it is joined once.
        pythonpath = ":".join(paths)

        cmd = (
            "cd {berkanavt}/user_browse/scripts/user_browse/;"
            "PYTHONPATH={pythonpath} "
            "MR_OPT= "
            "MR_CLUSTER_INFO={mr_cluster_info} " +
            self.get_client_environ_str() + " " +
            "python ./update.py "
            "{mode_args} "
            "--emails kaa@yandex-team.ru "
            "--no_src_depends "
            "--no_gemini "
            "--mr_table_prefix {tables_prefix} "
            "--last_date {last_date} "
        )
        # The option name depends on whether the patched config has
        # delayForDataSrc (plural, per-source dates) or not (single date).
        if has_delay_for_data_src:
            cmd += "--sample_period_oldest_dates {sample_period_oldest_dates} "
        else:
            cmd += "--sample_period_oldest_date {global_first_date} "

        extra_args = [
            "-sample_period_start {global_first_date}",
            "-max_job_memory_for_yt 4000000000",
            "-sample_period_max_rand 3",
            "-sample_period_size 0"
        ]
        cmd += "--comp_factor_data_extra_args '" + " ".join(extra_args) + "' "

        for p in self.periods:
            ctx = self.ctx.copy()
            ctx.update(p)
            ctx["global_first_date"] = self.periods[0]["first_date"]
            # Explicit floor division keeps the integer result that plain "/"
            # produced on Python 2.
            # NOTE(review): the command above hard-codes "-sample_period_size 0"
            # and has no {sample_period_size} placeholder; the value is kept in
            # the context in case run_shell_process uses it - confirm.
            ctx["sample_period_size"] = max(2, ctx["num_periods"] * ctx["days_per_period"] // 5)
            if has_delay_for_data_src:
                # similargroup gets an oldest date that is further back by
                # the configured delay (in periods).
                ctx["sample_period_oldest_dates"] = (
                    "spy_log:" + util.get_first_date(
                        p["last_date"], p["days_per_period"] * p["num_periods"]
                    )
                    + ',similargroup:' + util.get_first_date(
                        p["last_date"],
                        p["days_per_period"] * (
                            p["num_periods"] + self.dates_context["similargroup_data_delay"]
                        )
                    )
                )
            ctx["tables_prefix"] = self.get_tables_prefix()
            ctx["pythonpath"] = pythonpath

            util.run_shell_process("update_data.py", cmd, **ctx)
            # can't run in parallel, update depends on comm
            self.rem_client.wait_all_packets()

    def get_project_bin_dir(self):
        """Return the path ``<berkanavt>/user_browse/bin``."""
        return os.path.join(self.ctx["berkanavt"], "user_browse/bin")

    def updated_result_attrs(self, attrs):
        """Add the ``browse_*`` date attributes to ``attrs`` and return it.

        ``attrs`` is updated in place. ``data_format_version`` is added only
        when it is present in ``self.ctx``.
        """
        dates = self.dates_context
        attrs.update({
            "browse_first_date": dates["first_date"],
            "browse_last_date": dates["last_date"],
            "browse_num_periods": dates["num_periods"],
            "browse_days_per_period": dates["days_per_period"]
        })
        if "data_format_version" in self.ctx:
            attrs["data_format_version"] = self.ctx["data_format_version"]
        return attrs


# Module-level alias of the task class defined above.
# NOTE(review): presumably the name the Sandbox task loader looks up - confirm.
__Task__ = UserBrowseUpdateDataRun
