import datetime
import os

from library.cpp.retry.protos import retry_options_pb2

from crypta.lib.native.tvm.proto import tvm_config_pb2
from crypta.lib.python.tvm import helpers as tvm_helpers
from crypta.lib.python.yt import schema_utils
from crypta.siberia.bin.common.create_user_set_from_sample_reducer.proto import (
    describe_in_siberia_pb2,
    ids_to_describe_pb2,
    group_user_set_pb2,
    pre_describe_filter_combiner_state_pb2,
    pre_describe_filter_state_pb2,
)
from crypta.siberia.bin.common.create_user_set_from_sample_reducer.py import native_operations
from crypta.siberia.bin.common.describing.experiment.proto import describing_experiment_pb2
import crypta.siberia.bin.common.describing.mode.python.describing_mode as siberia_describing_mode


# Column name that the caller's group id column is renamed to while the native
# operations run; it is also the sort/reduce key of the first operation.
GROUP_ID_PROTO_COLUMN = "GroupID"
# TVM secret variable name with the "YT_SECURE_VAULT_" prefix prepended.
# NOTE(review): not referenced in this part of the file; presumably the name
# under which a YT job sees the `secure_vault` entry set below — confirm.
YT_SECURE_VAULT_TVM_SECRET_ENV_VAR = "YT_SECURE_VAULT_{}".format(tvm_helpers.TVM_SECRET_ENV_VAR)


def create_user_set_from_sample(
        yt_client,
        native_map_reduce_with_combiner,
        native_map,
        source,
        destination,
        group_id_column,
        id_column,
        sample_size,
        tvm_settings,
        experiment=None,
        id_type=None,
        id_type_column=None,
        siberia_host="siberia.crypta.yandex.net",
        siberia_port=80,
        timeout=datetime.timedelta(seconds=5),
        max_ids_per_second=50000,
        max_jobs=25,
        user_data_stats_options=None,
        describing_mode=siberia_describing_mode.SLOW,
):
    """Run the three YT operations that turn ``source`` into ``destination``.

    1. A map-reduce with combiner (``TEmptyMapper`` / ``TPreDescribeFilterCombiner`` /
       ``TPreDescribeFilter``) reads ``source`` with its columns renamed to the
       proto field names and writes ``destination`` with the ``TIdsToDescribe`` schema.
    2. A map (``TDescribeInSiberia``) rewrites ``destination`` in place with the
       ``TGroupUserSet`` schema.
    3. A sort rewrites ``destination`` in place, ordered by ``group_id_column``,
       with the ``GroupID`` column renamed back to ``group_id_column``.

    Args:
        yt_client: Client object providing ``TablePath`` and ``run_sort``.
        native_map_reduce_with_combiner: Callable that launches the first operation.
        native_map: Callable that launches the second operation.
        source: Input table.
        destination: Output table; written by every stage.
        group_id_column: Column of ``source`` renamed to ``GroupID`` for the
            native operations and restored in the final table.
        id_column: Column of ``source`` renamed to ``IdValue``.
        sample_size: Passed as ``SampleSize`` to both the combiner and reducer states.
        tvm_settings: Mapping with the keys ``"source_id"``, ``"destination_id"``
            and ``"secret"``.
        experiment: Passed as ``CryptaIdUserDataVersion`` of the describing experiment.
        id_type: Stored as ``IdType`` in the combiner state when
            ``id_type_column`` is not given.
        id_type_column: Optional column of ``source`` renamed to ``IdType``.
        siberia_host: Passed as ``Host`` in the mapper state.
        siberia_port: Passed as ``Port`` in the mapper state.
        timeout: ``datetime.timedelta``; truncated to whole seconds for ``TimeoutSec``.
        max_ids_per_second: Divided by ``max_jobs`` and passed as ``MaxIdsPerSecond``.
        max_jobs: ``user_slots`` resource limit of the second operation.
        user_data_stats_options: Optional message copied into the combiner
            state's ``UserDataStatsOptions``.
        describing_mode: Passed as ``DescribingMode`` in the mapper state.
    """
    # Row weight limit used by the table writers of both native operations.
    row_weight_limit = 128 * 2**20

    # --- Stage 1: pre-describe filter (map-reduce with combiner) ---
    combiner_pb = pre_describe_filter_combiner_state_pb2.TPreDescribeFilterCombinerState(
        SampleSize=sample_size,
    )
    if user_data_stats_options:
        combiner_pb.UserDataStatsOptions.CopyFrom(user_data_stats_options)

    renames = {id_column: "IdValue", group_id_column: GROUP_ID_PROTO_COLUMN}
    if not id_type_column:
        # No per-row type column, so the type comes from the combiner state.
        # NOTE(review): id_type defaults to None — presumably callers must pass
        # either id_type or id_type_column; confirm.
        combiner_pb.IdType = id_type
    else:
        renames[id_type_column] = "IdType"

    reducer_pb = pre_describe_filter_state_pb2.TPreDescribeFilterState(SampleSize=sample_size)

    sample_input = yt_client.TablePath(source, rename_columns=renames)
    ids_schema = schema_utils.get_schema_from_proto(ids_to_describe_pb2.TIdsToDescribe)
    ids_output = yt_client.TablePath(destination, schema=ids_schema)
    filter_spec = {
        "reduce_job_io": {
            "table_writer": {"max_row_weight": row_weight_limit},
        },
        "force_reduce_combiners": True,
    }

    native_map_reduce_with_combiner(
        mapper_name=native_operations.TEmptyMapper,
        combiner_name=native_operations.TPreDescribeFilterCombiner,
        reducer_name=native_operations.TPreDescribeFilter,
        source=sample_input,
        destination=ids_output,
        sort_by=GROUP_ID_PROTO_COLUMN,
        reduce_by=GROUP_ID_PROTO_COLUMN,
        combiner_state=combiner_pb.SerializeToString(),
        reducer_state=reducer_pb.SerializeToString(),
        spec=filter_spec,
    )

    # --- Stage 2: describe in Siberia (map over the table produced above) ---
    tvm_config = tvm_config_pb2.TTvmApiConfig(
        SelfClientId=tvm_settings["source_id"],
        Destinations={"siberia": tvm_settings["destination_id"]},
    )
    test_port = os.environ.get(tvm_helpers.TVM_TEST_PORT_ENV_VAR)
    if test_port is not None:
        tvm_config.TvmTestPort = int(test_port)

    timeout_sec = int(timeout.total_seconds())
    retry_options = retry_options_pb2.TRetryOptionsPB(
        MaxTries=10,
        InitialSleepMs=1000,
        SleepIncrementMs=1000,
        ExponentalMultiplierMs=0,
    )
    # NOTE(review): `/` is true division on Python 3 — confirm that
    # MaxIdsPerSecond accepts a non-integer value.
    ids_per_second = max_ids_per_second / max_jobs
    experiment_pb = describing_experiment_pb2.TDescribingExperiment(
        CryptaIdUserDataVersion=experiment,
    )
    describe_state = describe_in_siberia_pb2.TDescribeInSiberiaState(
        Tvm=tvm_config,
        Host=siberia_host,
        Port=siberia_port,
        TimeoutSec=timeout_sec,
        RetryOptions=retry_options,
        MaxIdsPerSecond=ids_per_second,
        DescribingMode=describing_mode,
        Experiment=experiment_pb,
    )

    user_set_schema = schema_utils.get_schema_from_proto(group_user_set_pb2.TGroupUserSet)
    described_output = yt_client.TablePath(destination, schema=user_set_schema)
    serialized_describe_state = describe_state.SerializeToString()
    describe_spec = {
        "secure_vault": {
            tvm_helpers.TVM_SECRET_ENV_VAR: tvm_settings["secret"],
        },
        "max_speculative_job_count_per_task": 0,
        "resource_limits": {"user_slots": max_jobs},
        "job_io": {
            "table_writer": {"max_row_weight": row_weight_limit},
        },
        "data_size_per_job": 2**29,
    }

    native_map(
        mapper_name=native_operations.TDescribeInSiberia,
        source=destination,
        destination=described_output,
        state=serialized_describe_state,
        spec=describe_spec,
    )

    # --- Stage 3: sort by the caller's group id column ---
    # A fresh schema is requested because it is edited in place below.
    sorted_schema = schema_utils.get_schema_from_proto(group_user_set_pb2.TGroupUserSet)
    for column in sorted_schema:
        if column["name"] != GROUP_ID_PROTO_COLUMN:
            continue
        column["name"] = group_id_column

    sort_input = yt_client.TablePath(
        destination,
        rename_columns={GROUP_ID_PROTO_COLUMN: group_id_column},
    )
    sort_output = yt_client.TablePath(destination, schema=sorted_schema)
    yt_client.run_sort(sort_input, sort_output, sort_by=group_id_column)
