# -*- coding: utf-8 -*-

import os
import logging
import datetime
import json
import time
import subprocess
import sys

from sandbox import common
import sandbox.common.types.resource as ctr

import sandbox.sandboxsdk.svn as sdk_svn
from sandbox.sandboxsdk.task import SandboxTask
from sandbox.projects.common.utils import check_if_tasks_are_ok
from sandbox.projects import resource_types
from sandbox.projects.common.utils import set_resource_attributes
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.parameters import SandboxStringParameter
from sandbox.sandboxsdk.parameters import SandboxBoolParameter
from sandbox.sandboxsdk.parameters import SandboxIntegerParameter
from sandbox.sandboxsdk.parameters import ResourceSelector
from sandbox.sandboxsdk import environments
import sandbox.projects.common.constants as consts
from sandbox.projects.common.vcs import arc
import sandbox.projects.logs.common as us_ci
import re

# Sub-folder of the fetches folder that RemoveScriptsCIOutputs cleans up.
FOLDER_FOR_USER_SESSIONS_PROCESSES_TESTING = 'user-sessions-processes-ci'
# Sub-folder of the fetches folder whose old children RemoveBinsCIDiffs prunes.
ALLDIFFS_FOLDER = 'alldiffs'
# Sub-folder of a working folder that receives the fetched raw logs (see OneFetchFolder).
RAW_LOGS_FOLDER = 'raw_logs'
# Name of the marker table written into the raw-logs folder when a fetch has finished.
FETCHED_MARK_TABLE_NAME = 'fetched'
# Task-context key set to True once DoFetching has completed.
FETCHING_DONE = 'fetching_done'


class FetchToolId(ResourceSelector):
    """
        Optional resource with the binary that makes hashed fetches by timestamp.
        Per the description, the binary is built by the task when no resource is given.
    """
    name = 'fetch_tool_id'
    description = 'Идентификатор ресурса бинарника, который делает hashed-выжимки по timestamp; если не передать, сам сбилдит'


class YtToolId(ResourceSelector):
    """
        Optional resource with the yt binary.
        Per the description, the binary is built by the task when no resource is given.
    """
    name = 'yt_resource_id'
    description = 'Идентификатор ресурса бинарника yt; если не передать, сам сбилдит'


class MapreduceId(ResourceSelector):
    """
        Optional resource with the mapreduce-yt binary.
        Per the description, the binary is built by the task when no resource is given.
    """
    name = 'mapreduce-yt_resource_id'
    description = 'Идентификатор ресурса бинарника mapreduce-yt; если не передать, сам сбилдит'


class OriginalLogsPathsPrefix(SandboxStringParameter):
    """
        Prefix prepended to the paths of the original (input) logs.
    """
    name = 'yt_prefix'
    description = 'Original logs paths prefix'
    default_value = '//'


class MapreduceCluster(SandboxStringParameter):
    """
        Cluster on which the data is fetched
    """
    name = 'mr_cluster'
    description = 'MR cluster where data will be fetched'
    default_value = 'hahn'


class FetchesFolder(SandboxStringParameter):
    """
        Folder where the fetched samples have to be stored
    """
    name = 'fetches_folder'
    description = 'Folder on a MR cluster where fetched data will be stored'
    default_value = 'autotest_create_sessions'


class FetchDate(SandboxStringParameter):
    """
        Date to fetch, parsed by the task as '%Y-%m-%d'.
        When empty, the task picks the date itself.
    """
    name = 'fetch_date'
    description = 'Дата, за которую фетчим. Если не задана, то берется самая последняя из тех, за которую есть все нужные логи.'
    default_value = ''


class LogsList(SandboxStringParameter):
    """
        Comma-separated list of logs to fetch.
        When empty, the list is read from the USERDATA_LOGS_TO_CHECK resource;
        ignored altogether when fetching for rem_processes testing is enabled.
    """
    name = 'logs_list'
    description = ('Список логов, из которых строим выжимку. Если не задан, то берем его из ресурса USERDATA_LOGS_TO_CHECK. Но если включён режим фетчинга '
                   ' для rem_processes_testing, то данное поле и ресурс USERDATA_LOGS_TO_CHECK игнорируются.')
    default_value = ''


class FetchForRemProcessesTesting(SandboxBoolParameter):
    """
        Switch for fetching samples for full user_sessions_processes testing.
        When set, the log lists come from sessions_config (daily and fast logs),
        the production folder layout is reproduced in the output, and the
        logs-list parameter and the USERDATA_LOGS_TO_CHECK resource are ignored.
    """
    name = 'fetch_for_rem_processes_testing'
    description = ('Делать выжимки для полного тестирования user_sessions_processes. То есть полноценные выжимки, для всех логов из списка '
                   'quality/user_sessions/rem_processes/sessions_config.py, в том числе для fast-логов. '
                   'Также будет сохраняться структура папок логов в продакшне. '
                   'Если True, то игнорируется поле списка логов и ресурс USERDATA_LOGS_TO_CHECK.')
    default_value = False


class RemServer(SandboxStringParameter):
    """
        REM server used, per the description, to learn which logs have been delivered.
    """
    name = 'rem_server'
    description = 'REM, с которого получаем информацию о доехавших до кедра логах'


class DebugMode(SandboxStringParameter):
    """
        Debug-mode flag; any non-empty value marks the freshdata resource as debug.
        The description asks users not to touch this parameter.
    """
    name = 'debug_mode'
    description = 'Режим отладки. Не трогайте этот параметр'
    default_value = ''


class CreateFreshdataResource(SandboxBoolParameter):
    """
        Whether to create the resource that records the path to the fetched data.
        Per the description, this must be True for production runs.
    """
    name = 'create_freshdata_resource'
    description = 'Создавать ли ресурс с путем к выжимке. При запуске в продакшене этот параметр должен быть True.'
    default_value = False


class FreshdataResourceType(SandboxStringParameter):
    """
        Type name of the output resource that records the path to the fetched data.
    """
    name = 'freshdata_resource_type'
    default_value = 'YT_SESSIONS_FRESHDATA_INFO'
    # default_value is defined first because the description text embeds it
    description = 'Имя типа output-ресурса с путём к выжимке. По умолчанию {}'.format(default_value)


class RowsToFetchCount(SandboxIntegerParameter):
    """
        Number of rows to take from each log in the 'random_n_rows' fetching mode.
    """
    name = 'rows_to_fetch_count'
    description = 'Сколько записей выжать из лога'
    default_value = 100


class TryFastInput(SandboxBoolParameter):
    """
        Whether the by-timestamp fetch should first try the half-hourly (fast) tables;
        per the description, the daily table is used when no fast table exists.
    """
    name = 'try_fast_input_for_timestamp_hashed_fetch'
    default_value = True
    # default_value is defined first because the description text embeds it
    description = 'Брать ли из получасовых (если таблицы нету, то берёт дневную): ' + str(default_value)


class MinCount(SandboxIntegerParameter):
    """
        Length, in minutes, of the time window fetched from each log in the by-timestamp mode.
    """
    name = 'fetch_minutes_size'
    default_value = 10
    # default_value is defined first because the description text embeds it
    description = 'Сколько минут выжать из лога, default_value: ' + str(default_value)


class FetchHourBeg(SandboxIntegerParameter):
    """
        Hour of the day at which the fetched time window starts in the by-timestamp mode.
    """
    name = 'fetch_hour_beg'
    default_value = 9
    # default_value is defined first because the description text embeds it
    description = 'Начиная с какого часа из суток выжимать указанное количество минут, default_value: ' + str(default_value)


class FetchingMode(SandboxStringParameter):
    """
        Selects how the sample is produced: a time window ('by_timestamp')
        or a fixed number of rows ('random_n_rows').
    """
    name = 'fetching_mode'
    description = 'Как именно делать выжимку'
    by_timestamp = 'by_timestamp'
    random_n_rows = 'random_n_rows'
    possible_modes = [by_timestamp, random_n_rows]
    # every choice uses the same string as its label and its value
    choices = zip(possible_modes, possible_modes)
    default_value = by_timestamp
    # maps each mode to the names of the parameters that belong to it
    sub_fields = {random_n_rows: [RowsToFetchCount.name], by_timestamp: [FetchHourBeg.name, MinCount.name, TryFastInput.name]}


class SubfolderName(SandboxStringParameter):
    """
        Selects which date names the sub-folder the task creates inside the
        user-specified root folder: the date of the fetched logs or today's date.
    """
    name = 'subfolder_name'
    description = 'Имя папки, которую таск создаст в корневой папке, указанной пользователем'
    fetched_logs_date = 'fetched_logs_date'
    fetching_date = 'fetching_date (today)'
    possible_names = [fetched_logs_date, fetching_date]
    # every choice uses the same string as its label and its value
    choices = zip(possible_names, possible_names)
    default_value = fetched_logs_date


class YTToken(SandboxStringParameter):
    """
        Name of the vault secret that holds the YT token.
    """
    name = 'yt_token'
    description = 'Name of secret with YT token (from sb-vault)'


class YTTokenOwner(SandboxStringParameter):
    """
        Owner of the vault secret that holds the YT token; the task owner is used when empty.
    """
    name = 'yt_token_owner'
    description = 'Owner of secret with YT token (default: owner of task)'


class YTPool(SandboxStringParameter):
    """
        YT pool exported to child processes as YT_POOL; a value is mandatory.
    """
    name = 'yt_pool'
    description = 'YT pool has to be explicitly set! For example, userdata-sessions-build-ci. Otherwise process might work slowly'
    required = True


class CopyResources(SandboxBoolParameter):
    """
        Parameter with no effect in this module; per the description it is kept
        temporarily for backward compatibility with the reactor.
    """
    name = 'copy_resources'
    description = 'НЕ РАБОТАЕТ. Временно оставленный параметр для обратной совместимости с реактором'
    default_value = True


def RunProcess(cmd, env, input=None):
    """
        Run a command through the shell and hand back what it printed.

        cmd   -- list of command-line tokens; they are joined with single
                 spaces into one shell string (no quoting is applied)
        env   -- environment mapping for the child process
        input -- optional data sent to the child's stdin

        Returns the (stdout, stderr) pair of the finished process.
        Raises Exception with the (possibly shortened) stderr when the
        process exits with a non-zero code.
    """
    process = run_process(
        ' '.join(cmd),
        wait=False,
        outs_to_pipe=True,
        check=False,
        # a stdin pipe is requested only when there is something to send
        stdin=subprocess.PIPE if input is not None else None,
        shell=True,
        environment=env
    )
    stdout_data, stderr_data = process.communicate(input)
    if process.returncode != 0:
        raise Exception(CutLargeError(stderr_data))

    logging.info('RunProcess result: ' + str(stdout_data))
    return stdout_data, stderr_data


def GetYtFullPath(yt_path):
    """
        Turn a YT path into a full one by replacing its leading slashes with
        the YT_PREFIX environment variable ('//' when the variable is unset).
    """
    prefix = os.environ.get('YT_PREFIX', '//')
    return prefix + yt_path.lstrip('/')


def CalculateClusterName(host):
    """
        Map a host string (or a cluster alias) to the short cluster name.

        Raises Exception for a host that is not listed.
    """
    aliases_by_cluster = (
        ('cedar', ('cedar00:8013', 'cedar00.search.yandex.net:8013', 'cedar')),
        ('sakura', ('sakura00:8013', 'sakura00.search.yandex.net:8013', 'sakura')),
        ('hahn', ('hahn',)),
    )
    for cluster_name, aliases in aliases_by_cluster:
        if host in aliases:
            return cluster_name

    raise Exception('Dont know cluster name for ' + host)


def CutLargeError(errorMsg):
    """
        Shorten an overly long error message.

        Messages of up to 6000 characters are returned unchanged. Longer ones
        are reduced to their first and last 3000 characters joined by a
        '<...>' marker line.
    """
    maxLen = 6000
    if len(errorMsg) <= maxLen:
        return errorMsg

    # Floor division keeps the slice bounds integral under true division as well.
    half = maxLen // 2
    return errorMsg[:half] + '\n<...>\n' + errorMsg[-half:]


def ConfigureSysPath(arc_dir):
    """
        Put the project directories needed for the lazy imports in this
        module at the front of sys.path, relative to the given checkout root.
    """
    relative_dirs = (
        ("quality", "user_sessions", "rem_processes"),
        ("quality", "user_sessions", "reactor"),
        ("rem", "client"),
        ("mapreduce", "library", "mr_packet_lib"),
        ("logfeller", "python"),
    )
    # slice assignment prepends while keeping the listed order
    sys.path[:0] = [os.path.join(arc_dir, *parts) for parts in relative_dirs]


class YtFetcher(object):
    def __init__(self, task_context, yt_token):
        """
            Remember the task context and the token, then configure the
            process-wide yt.wrapper client.

            task_context -- task context mapping; 'mr_cluster' and 'rem_server' are read from it
            yt_token     -- YT token used by the client and by child processes
        """
        import yt.wrapper as yt

        self.ctx = task_context
        self.yt_token = yt_token
        self.MRServer = task_context['mr_cluster']
        self.REMServer = task_context['rem_server']

        # NOTE(review): the wrapper proxy is pinned to "hahn" regardless of 'mr_cluster'
        yt.config["proxy"]["url"] = "hahn"
        yt.config["token"] = yt_token
        yt.config['read_retries']['allow_multiple_ranges'] = True

    def BuildBinariesIfNeeded(self, base_task):
        """
            Make sure the tool binaries exist and are synced locally.

            On the first call (no 'child_tasks_ids' in the context) a BUILD_SEARCH
            sub-task is created, the ids of the resources it will produce are
            copied into the context, and the task waits for the sub-task. On
            later calls the recorded sub-tasks are checked instead. Afterwards
            every resource is synced, its local path is stored in the context
            and its ttl attribute is set to 30.

            base_task -- the task object used to create sub-tasks and sync resources
        """
        if 'child_tasks_ids' not in self.ctx:
            sub_tasks = []

            task = base_task.create_subtask(
                task_type='BUILD_SEARCH',
                input_parameters={
                    self.GetFetchBinaryBuildName(): True,
                    'build_mr_ls': True,
                    'build_mr_rm': True,
                    'build_yt': True,
                    self.GetMapreduceBinaryBuildName(): True,
                    consts.ARCADIA_URL_KEY: 'arcadia:/arc/trunk/arcadia@HEAD',
                    'build_system': 'semi_distbuild',
                    'use_aapi_fuse': True,
                    'aapi_fallback': True,
                },
                description='Building data fetcher to test create_sessions')
            sub_tasks.append(task.id)

            # The context is filled in before waiting.
            # NOTE(review): presumably wait_all_tasks_completed suspends this task and
            # execution restarts later, so nothing after it may be relied on here - confirm.
            self.ctx['fetching_by_timestamp_resource_id'] = task.ctx[self.GetFetchBinaryResourceIdName()]
            self.ctx['mr_ls_resource_id'] = task.ctx['mr_ls_resource_id']
            self.ctx['mr_rm_resource_id'] = task.ctx['mr_rm_resource_id']
            self.ctx['yt_resource_id'] = task.ctx['yt_resource_id']
            self.ctx['mapreduce_resource_id'] = task.ctx[self.GetMapreduceBinaryResourceIdName()]
            self.ctx['child_tasks_ids'] = sub_tasks
            base_task.wait_all_tasks_completed(sub_tasks)
        else:
            check_if_tasks_are_ok(self.ctx['child_tasks_ids'])

        # context key that receives the local path -> context key holding the resource id
        path_key_to_id_key = {
            'fetcher_path': 'fetching_by_timestamp_resource_id',
            'mr_ls_path': 'mr_ls_resource_id',
            'mr_rm_path': 'mr_rm_resource_id',
            'yt_path': 'yt_resource_id',
            'mapreduce_path': 'mapreduce_resource_id',
        }
        for path_key, id_key in path_key_to_id_key.items():
            res_id = self.ctx[id_key]
            self.ctx[path_key] = base_task.sync_resource(res_id)
            set_resource_attributes(res_id, {"ttl": "30"})

        # the mapreduce path is synced above but is not logged here
        logging.info('Fetcher path: %s' % self.ctx['fetcher_path'])
        logging.info('mr_ls path: %s' % self.ctx['mr_ls_path'])
        logging.info('mr_rm path: %s' % self.ctx['mr_rm_path'])
        logging.info('yt path: %s' % self.ctx['yt_path'])

    def LogUploadTag(self, log, date_time, period, cluster_name):
        """
            Build the cluster-specific upload tag of a log for the given time and period.
        """
        import logs_lib
        import mr_packet_lib

        return mr_packet_lib.get_cluster_tag(
            cluster_name,
            logs_lib.getLogUploadTag(log, date_time, period)
        )

    def SeekLogsForDateTime(self, logs_to_fetch, date_time, period, cluster_name, rem_connection, GetTagsBulk):
        """
            Tell whether the upload tag of every requested log is present.

            GetTagsBulk is called with the connection and the tag list; the
            answer is True when the filtered bulk holds as many tags as there
            are logs.
        """
        tags_list = []
        for log in logs_to_fetch:
            tags_list.append(self.LogUploadTag(log, date_time, period, cluster_name))
        logging.info('looking in REM for tags: ' + str(tags_list))

        found_tags = GetTagsBulk(rem_connection, tags=tags_list).FilterSet().GetTags()
        return len(found_tags) == len(logs_to_fetch)

    def CalculateSourcesDateTime(self, logs_to_fetch, period):
        """
            Pick the date/time whose logs are to be fetched.

            Currently this is simply "five days ago", rounded to the period;
            logs_to_fetch is accepted for interface compatibility but not
            consulted.

            logs_to_fetch -- list of log names (unused for now)
            period        -- period passed to time_util.datetime_round
        """
        import time_util

        now = datetime.datetime.now()

        # TODO: tmp - before support reactor
        # The statement that used to follow this return could never execute and was dropped.
        return time_util.datetime_round(now - datetime.timedelta(days=5), period)

    def FetchByTimestamp(self, working_folder, logs_to_fetch, date_time, period, task):
        """
            Fetch a time window from every log, one fetcher process per log.

            The window starts at the configured hour of date_time. When the
            fast-input option is on, that hour offset is also passed along so
            that a fast table may be used instead.
        """
        hour_beg = self.ctx[FetchHourBeg.name]
        ts_begin, ts_end = self.CalcTimestamps(date_time, hours_to_add_to_date_time=hour_beg)

        for raw_name in logs_to_fetch:
            if self.ctx[TryFastInput.name]:
                fast_delta = datetime.timedelta(hours=hour_beg)
            else:
                fast_delta = None

            fetch_cmd = self.MakeFetchByTimestampCmd(raw_name.strip(), date_time, working_folder, ts_begin, ts_end, period,
                                                     timedelta_if_fast_input=fast_delta, task=task)
            us_ci.RunProcesses([fetch_cmd], self.GetEnv())

    def FetchNRows(self, working_folder, logs_to_fetch, date_time, period, task):
        """
            Copy one contiguous range of rows from every log into the output folder.

            For each log the input table's row count is read, a row range is
            chosen, and a 'cat' map operation copies that range to the output
            table in YAMR format with subkey. The input format comes from the
            table's '_format' attribute when it has one.

            working_folder -- folder that receives the fetched tables
            logs_to_fetch  -- log names to process
            date_time      -- date/time of the input tables
            period         -- period of the input tables
            task           -- unused; kept so the signature matches FetchByTimestamp
        """
        from us_processes import yamr2yt_log_names_converter  # noqa
        import yt.wrapper as yt
        yt.config["proxy"]["url"] = "hahn"
        yt.config["token"] = self.yt_token
        yt.config['read_retries']['allow_multiple_ranges'] = True

        def calc_random_ranges(sample_row_count, total_row_count):
            # Despite the name, the range is deterministic: it starts in the middle of
            # the table, or earlier when fewer rows than requested follow the middle.
            sample_row_count = min(sample_row_count, total_row_count)
            # Floor division keeps row indices integral under true division as well.
            lower_limit = min(int(total_row_count) // 2, total_row_count - sample_row_count)
            # NOTE(review): if YT treats upper_limit as exclusive, this selects
            # sample_row_count - 1 rows - confirm whether the "- 1" is intended.
            upper_limit = lower_limit + sample_row_count - 1

            return [{'lower_limit': {'row_index': lower_limit}, 'upper_limit': {'row_index': upper_limit}}]

        for log in logs_to_fetch:
            ytLogName = yamr2yt_log_names_converter.getYTLogName(log)

            full_input_path = GetYtFullPath(self.GetInputRawLogPath(ytLogName, date_time, period))
            full_output_path = GetYtFullPath(self.GetOutputRawLogPath(working_folder, ytLogName, date_time, period))
            input_row_count = yt.get_attribute(full_input_path, 'row_count')
            ranges = calc_random_ranges(self.ctx[RowsToFetchCount.name], input_row_count)

            input_format = yt.YamrFormat(has_subkey=True)
            if yt.has_attribute(full_input_path, '_format'):
                format_name = yt.get(full_input_path + '/@_format', format=yt.YsonFormat())
                input_format = yt.create_format(format_name)

            yt.run_map('cat', yt.TablePath(full_input_path, ranges=ranges), full_output_path,
                       input_format=input_format, output_format=yt.YamrFormat(has_subkey=True))

    def Fetch(self, working_folder, logs_to_fetch, date_time, period, task):
        """
            Fetch the given logs with the configured fetching mode and then
            write the "fetched" marker table into the raw-logs folder.
        """
        self.CreatePathsIfNeeded(working_folder, logs_to_fetch, date_time, period)

        use_timestamps = self.ctx[FetchingMode.name] == FetchingMode.by_timestamp
        fetch_impl = self.FetchByTimestamp if use_timestamps else self.FetchNRows
        fetch_impl(working_folder, logs_to_fetch, date_time, period, task)

        mark_path = us_ci.MRPathJoin(self.OneFetchFolder(working_folder), FETCHED_MARK_TABLE_NAME)
        self.PlaceMRMark(mark_path, '1\t')

    def RemoveOldSubfolders(self, root_folder, subfolder_to_leave_count, special_subfolders_to_leave=None):
        """
            Keep only the last sub-folders (in sorted name order) of a folder.

            Sub-folders are sorted by name; all but the last
            subfolder_to_leave_count of them are removed, and the kept ones get
            replication factor 2. Sub-folders named in
            special_subfolders_to_leave are neither removed nor touched.

            root_folder                 -- folder whose sub-folders are pruned
            subfolder_to_leave_count    -- how many of the last sub-folders to keep;
                                           0 removes every non-special sub-folder
            special_subfolders_to_leave -- optional iterable of names to skip entirely
        """
        protected = set(special_subfolders_to_leave or ())
        candidates = sorted(self.GetAllFolders(root_folder) - protected)

        # An explicit split index is used because the slices [:-0] and [-0:] would
        # keep everything when the count is 0 instead of removing everything.
        first_kept = max(len(candidates) - subfolder_to_leave_count, 0)

        for subfolder in candidates[:first_kept]:
            self.RemoveFolder(us_ci.MRPathJoin(root_folder, subfolder))

        for subfolder in candidates[first_kept:]:
            self.SetReplicationFactor(us_ci.MRPathJoin(root_folder, subfolder), replication_factor=2)

    def RemoveBinsCIDiffs(self, meat_folder):
        """
            Prune the 'alldiffs' folder, keeping its last 200 sub-folders.
        """
        self.RemoveOldSubfolders(us_ci.MRPathJoin(meat_folder, ALLDIFFS_FOLDER), 200)

    def RemoveScriptsCIOutputs(self, meat_folder):
        """
            Prune the user-sessions-processes CI outputs, keeping the last 30
            sub-folders and never touching the raw-logs folder.
        """
        # NOTE(review): the date component is a fixed literal - confirm that is intended
        scripts_ci_folder = us_ci.MRPathJoin(meat_folder, FOLDER_FOR_USER_SESSIONS_PROCESSES_TESTING, '2016-12-12')
        self.RemoveOldSubfolders(scripts_ci_folder, 30, [RAW_LOGS_FOLDER])

    def RemoveOtherOldData(self, meat_folder):
        """
            Run the clean-ups that do not concern the fetched logs themselves:
            CI diffs, scripts CI outputs and CI builds, in that order.
        """
        cleanup_steps = (
            self.RemoveBinsCIDiffs,
            self.RemoveScriptsCIOutputs,
            self.RemoveBinsCIBuilds,
        )
        for cleanup in cleanup_steps:
            cleanup(meat_folder)

    def RemoveWhatYouCanRemoveBeforeFetching(self, meat_folder):
        """
            Clean up old data ahead of a new fetch: first the old fetched
            logs, then the remaining old CI data.
        """
        # this custom function will be removed later, when fetching and removing will become separated

        self.RemoveOldFetchedLogsBeforeNewFetching(meat_folder)
        self.RemoveOtherOldData(meat_folder)

    def RemoveOldFetchedLogsBeforeNewFetching(self, meat_folder):
        """
            Remove every date-named folder except the last two fully fetched ones.

            Date folders that were never fully fetched are removed as well.
        """
        meat_folder = meat_folder.lstrip('/')

        fetched_sorted = sorted(self.GetFetchedFolders(meat_folder))
        logging.info('fully_fetched_folders_list: ' + str(fetched_sorted))

        # this runs before a new fetch: two are kept and the third is about to be created
        folders_to_keep = fetched_sorted[-2:]
        logging.info('date_folders_to_leave: ' + str(folders_to_keep))

        folders_to_remove = self.GetAllDateFolders(meat_folder) - set(folders_to_keep)
        logging.info('folders_to_remove: ' + str(folders_to_remove))

        for folder in folders_to_remove:
            self.RemoveFolder(us_ci.MRPathJoin(meat_folder, folder))

    def RemoveBinsCIBuilds(self, meat_folder):
        """
            Prune old create_sessions test builds inside every fully fetched folder.

            For each log sub-folder under '<date>/user_sessions/build/logs' the
            '1d/create_sessions_test-trunk' folder, when present, is cut down
            to its last 20 sub-folders.
        """
        import yt.wrapper as yt

        meat_folder = meat_folder.lstrip('/')

        for complete_folder in sorted(self.GetFetchedFolders(meat_folder)):
            build_logs_folder = us_ci.MRPathJoin(meat_folder, complete_folder, "user_sessions/build/logs")
            if not yt.exists("//" + build_logs_folder.lstrip('/')):
                continue

            for logs_subdir in self.GetAllFolders(build_logs_folder):
                trunk_dir = us_ci.MRPathJoin(build_logs_folder, logs_subdir, "1d/create_sessions_test-trunk")
                if yt.exists("//" + trunk_dir.lstrip('/')):
                    self.RemoveOldSubfolders(trunk_dir, 20, [])

    def RefreshDataResources(self, daily_datetime, daily_logs_to_fetch, fast_datetime, fast_logs_to_fetch, working_folder, subfolder_name_date, base_task):
        """
            Publish a resource describing where the fetched data lives.

            Writes 'data_paths.txt' with five lines (working folder, daily log
            list, daily date/time, fast log list, fast date/time), registers it
            as a resource of the configured type, marks the resource ready and
            stores its id in the context under 'freshdata_resource_id'.

            daily_datetime / fast_datetime           -- datetimes written as '%Y-%m-%d:%H:%M'
            daily_logs_to_fetch / fast_logs_to_fetch -- log names, written comma-separated
            working_folder                           -- folder holding the fetched data
            subfolder_name_date                      -- stored in the 'created' attribute
            base_task                                -- task object used to create the resource
        """
        logging.info('RefreshDataResources start')
        logging.info('trying to create data version resource')

        file_lines = [
            us_ci.MRPathJoin(working_folder),
            ','.join(daily_logs_to_fetch),
            daily_datetime.strftime('%Y-%m-%d:%H:%M'),
            ','.join(fast_logs_to_fetch),
            fast_datetime.strftime('%Y-%m-%d:%H:%M'),
        ]
        # the context manager closes the file even when a write fails
        with open('data_paths.txt', 'w+') as data_paths_file:
            for line in file_lines:
                data_paths_file.write(line + '\n')

        resource = base_task.create_resource('data_paths',
                                             'data_paths.txt',
                                             self.ctx[FreshdataResourceType.name],
                                             arch=None,
                                             attributes={
                                                 'debug': 'True' if self.ctx['debug_mode'] else 'False',
                                                 'created': str(subfolder_name_date)
                                             })
        res_id = resource.id
        base_task.mark_resource_ready(res_id)
        logging.info('created resource.id: ' + str(res_id))
        self.ctx['freshdata_resource_id'] = res_id

        logging.info('RefreshDataResources finish')

    def PlaceMRMark(self, path, data):
        """
            Write a single record to a table with the mapreduce binary and log
            what the binary printed.
        """
        write_cmd = [self.ctx['mapreduce_path'], '-server', self.MRServer, '-write', path]
        out, err = RunProcess(write_cmd, self.GetEnv(), data + '\n')

        for stream_name, content in (('stdout', out), ('stderr', err)):
            logging.info('mapreduce ' + stream_name + ': ' + content)

    def RefreshMeat(self, subfolder_name_date, meat_folder):
        """
            Point the '<meat_folder>/freshmeat' table at the sub-folder named
            after the given date.
        """
        mark_path = meat_folder + '/freshmeat'
        mark_row = '\t' + meat_folder + '/' + self.StrDateInLogName(subfolder_name_date)
        self.PlaceMRMark(mark_path, mark_row)

    def GetInputRawLogPath(self, ytLogName, date_time, period):
        """
            Return the path of the original log table, with its leading
            slashes replaced by the configured original-logs prefix.
        """
        import logs_lib

        relative_path = logs_lib.getLogMRPath(ytLogName, date_time, period).lstrip('/')
        prefix = self.ctx.get(OriginalLogsPathsPrefix.name, OriginalLogsPathsPrefix.default_value)
        return prefix + relative_path

    def CalcTimestamps(self, date_time, hours_to_add_to_date_time):
        """
            Compute the borders of the fetched time window.

            The window starts hours_to_add_to_date_time hours after date_time
            and lasts the configured number of minutes. Both borders are
            returned as decimal strings of whole seconds, as (begin, end).
        """
        def to_epoch_str(offset):
            # time.mktime interprets the time tuple as local time
            return str(int(time.mktime((date_time + offset).timetuple())))

        begin_offset = datetime.timedelta(hours=hours_to_add_to_date_time, minutes=0)
        end_offset = datetime.timedelta(hours=hours_to_add_to_date_time, minutes=self.ctx[MinCount.name])

        return (to_epoch_str(begin_offset), to_epoch_str(end_offset))

    def GetSmartMaybeFastInputRawLogPath(self, ytLogName, date_time, period, timedelta_for_fast):
        """
            Look for an existing fast table of the log among the last four days.

            Starting from the current time rounded to a one-day period and
            going back zero to three days, timedelta_for_fast is added and the
            first existing fast table wins. Returns a tuple (prefixed input
            path, begin timestamp, end timestamp) or None when nothing was
            found or any error occurred. date_time and period are not used.
        """
        import logs_lib
        import time_periods
        import yt.wrapper as yt
        import time_util
        yt.config["proxy"]["url"] = "hahn"
        yt.config["token"] = self.yt_token
        yt.config['read_retries']['allow_multiple_ranges'] = True

        try:
            today_rounded = time_util.datetime_round(datetime.datetime.now(), datetime.timedelta(days=1))
            for days_back in range(4):
                candidate_dt = today_rounded - datetime.timedelta(days=days_back) + timedelta_for_fast
                candidate_path = logs_lib.getLogMRPath(ytLogName, candidate_dt, time_periods.Periods.FAST)
                if not yt.exists("//" + candidate_path.strip('/')):
                    continue

                ts_begin, ts_end = self.CalcTimestamps(candidate_dt, hours_to_add_to_date_time=0)
                prefix = self.ctx.get(OriginalLogsPathsPrefix.name, OriginalLogsPathsPrefix.default_value)
                return (prefix + candidate_path.lstrip('/'), ts_begin, ts_end)
        except Exception as e:
            # best effort: the caller falls back to the regular input table
            logging.info("Failed to fetch fast input: {}".format(e))

        return None

    def GetOutputRawLogPath(self, working_folder, ytLogName, date_time, period):
        """
            Return the path of the output table for a log inside the raw-logs folder.

            When fetching for rem_processes testing, the input path is reused
            as the suffix so that production's folder structure is kept;
            otherwise the suffix is just the log name.
        """
        keep_production_layout = self.ctx[FetchForRemProcessesTesting.name]
        suffix = self.GetInputRawLogPath(ytLogName, date_time, period) if keep_production_layout else ytLogName

        return us_ci.MRPathJoin(self.OneFetchFolder(working_folder), suffix)

    def MakeFetchByTimestampCmd(self, log, date_time, working_folder, ts_begin, ts_end, period, timedelta_if_fast_input=None, task=None):
        """
            Build the command line of the by-timestamp fetcher for one log.

            log                     -- log name, converted to its YT name
            date_time, period       -- identify the regular input table
            working_folder          -- folder that receives the output table
            ts_begin, ts_end        -- window borders; replaced by the fast table's
                                       own borders when a fast table is used
            timedelta_if_fast_input -- when not None, a fast input table is looked
                                       up first with this offset
            task                    -- when given, the fetcher is wrapped in strace
                                       writing to the task's log path; when omitted,
                                       the fetcher is run directly

            Returns the command as a list of strings.
        """
        from us_processes import yamr2yt_log_names_converter  # noqa

        ytLogName = yamr2yt_log_names_converter.getYTLogName(log)

        # Without a task there is nowhere to put the strace log, so the wrapper is
        # skipped; previously this case failed on an unbound variable.
        strace_prefix = []
        if task:
            strace_prefix = ['strace', '-tt', '-o', task.log_path('strace_' + log + '.log')]

        input_path = None

        if timedelta_if_fast_input is not None:
            smart_res = self.GetSmartMaybeFastInputRawLogPath(ytLogName, date_time, period, timedelta_if_fast_input)
            if smart_res is not None:
                input_path, ts_begin, ts_end = smart_res

        if input_path is None:
            input_path = self.GetInputRawLogPath(ytLogName, date_time, period)

        result = strace_prefix + [
            self.ctx['fetcher_path'], '-s', self.MRServer,
            '--input', input_path,
            '--output', self.GetOutputRawLogPath(working_folder, ytLogName, date_time, period),
            '-b', ts_begin, '-e', ts_end,
        ]

        if ytLogName == 'direct_urls':
            result.append('--direct_urls')

        return result

    def OneFetchFolder(self, working_folder):
        """
            Return the raw-logs folder that lives inside the given working folder.
        """
        raw_logs_folder = us_ci.MRPathJoin(working_folder, RAW_LOGS_FOLDER)
        return raw_logs_folder

    def StrDateInLogName(self, date):
        """
            Format a date (or datetime) as 'YYYY-MM-DD', the form used in folder names.
        """
        folder_date_format = '%Y-%m-%d'
        return date.strftime(folder_date_format)

    def CheckIfAlreadyFetched(self, working_folder):
        """
            Tell whether the "fetched" marker table exists in the raw-logs
            folder, i.e. whether the synced yt binary's 'exists' prints 'true'.
        """
        mark_table = us_ci.MRPathJoin(self.OneFetchFolder(working_folder), FETCHED_MARK_TABLE_NAME)
        output, _ = RunProcess([self.ctx['yt_path'], 'exists', mark_table], self.GetEnv())

        return output.strip() == 'true'

    def GetFetchedFolders(self, base_folder):
        """
            Return the set of folder names (digits and dashes) under
            base_folder whose raw-logs folder contains the "fetched" marker table.
        """
        find_cmd = ['yt', 'find', '--path', base_folder, '--name', FETCHED_MARK_TABLE_NAME, '--type', 'table']
        output, _ = RunProcess(find_cmd, self.GetEnv())

        # captures the path component right before '<raw logs folder>/<marker table>'
        mark_pattern = re.compile('([0-9\\-]+)/{}/{}$'.format(RAW_LOGS_FOLDER, FETCHED_MARK_TABLE_NAME))
        matches = (mark_pattern.search(table) for table in output.split('\n'))

        return set(match.group(1) for match in matches if match is not None)

    def GetAllDateFolders(self, base_folder):
        """
            Return the set of sub-folders of base_folder whose names parse as
            '%Y-%m-%d' dates.
        """
        def _is_date(folder):
            try:
                datetime.datetime.strptime(folder, "%Y-%m-%d")
            except Exception:
                return False
            return True

        return set(folder for folder in self.GetAllFolders(base_folder) if _is_date(folder))

    def GetAllFolders(self, base_folder):
        """
            Return the set of names of the map_node children of base_folder,
            parsed from the output of 'yt list -l'.
        """
        listing, _ = RunProcess(['yt', 'list', '-l', base_folder], self.GetEnv())

        folders = set()
        for line in listing.split('\n'):
            columns = line.strip().split()
            if not columns:
                continue
            # the first column is the node type, the sixth one is read as the node name
            if columns[0] == 'map_node':
                folders.add(columns[5])

        return folders

    def RemoveFolder(self, folder):
        """
            Recursively remove a folder with 'yt remove -r'.
        """
        logging.info('removing folder "' + folder + '"')
        RunProcess(['yt', 'remove', '-r', folder], self.GetEnv())

    def SetReplicationFactor(self, folder, replication_factor):
        """
            Set the replication factor of every table found under a folder.

            When the factor is below 3, the tables are additionally marked
            as not vital.
        """
        import yt.wrapper as yt

        mark_non_vital = replication_factor < 3

        for table in yt.search(GetYtFullPath(folder), node_type="table"):
            logging.info('setting replication_factor for table "' + table + '" to ' + str(replication_factor))
            RunProcess(['yt', 'set', '{}/@replication_factor'.format(table), str(replication_factor)], self.GetEnv())

            if not mark_non_vital:
                continue

            logging.info('setting vital to false as replication_factor is low')
            RunProcess(['yt', 'set', '{}/@vital'.format(table), '%false'], self.GetEnv())

    def CreateRootPath(self, working_folder):
        """
            Create the raw-logs folder of a working folder with the synced yt
            binary ('create -ri map_node').

            Returns whether the command printed 'true'.
        """
        raw_logs_folder = self.OneFetchFolder(working_folder)
        logging.info('Creating path ' + raw_logs_folder)

        output, _ = RunProcess([self.ctx['yt_path'], 'create', '-ri', 'map_node', raw_logs_folder], self.GetEnv())

        return output.strip() == 'true'

    def CreatePathsIfNeeded(self, working_folder, logs_to_fetch, date_time, period):
        """
            Create the parent folders of the output tables.

            This is only done when fetching for rem_processes testing, because
            only then production's folder structure is reproduced; otherwise
            the output tables sit directly in the raw-logs folder.
        """
        from us_processes import yamr2yt_log_names_converter  # noqa

        if self.ctx[FetchForRemProcessesTesting.name]:
            for log in logs_to_fetch:
                ytLogName = yamr2yt_log_names_converter.getYTLogName(log)
                output_table = self.GetOutputRawLogPath(working_folder, ytLogName, date_time, period)
                # everything before the last '/' is the folder that has to exist
                parent_folder = output_table.rsplit('/', 1)[0]
                RunProcess([self.ctx['yt_path'], 'create', '-ri', 'map_node', parent_folder], self.GetEnv())

    def GetEnv(self):
        """
            Build the environment for child processes: a copy of the current
            environment plus the YT settings (runtime, prefix, token, pool,
            proxy and an operation spec raising the table writer's max row weight).
        """
        yt_spec = {
            "job_io": {
                "table_writer": {
                    "max_row_weight": 128 * 1024 * 1024
                }
            }
        }

        env = dict(os.environ)
        env.update({
            'MR_RUNTIME': 'YT',
            'YT_PREFIX': '//',
            'YT_TOKEN': self.yt_token,
            'YT_POOL': self.ctx[YTPool.name],
            'YT_PROXY': self.MRServer,
            'YT_SPEC': json.dumps(yt_spec),
        })
        return env

    def GetFetchBinaryBuildName(self):
        """
            Name of the build sub-task input parameter that enables building the fetcher binary.
        """
        return 'build_yt_fetching_by_timestamp'

    def GetFetchBinaryResourceIdName(self):
        """
            Key in the build sub-task's context that holds the fetcher binary's resource id.
        """
        return 'yt_fetching_by_timestamp_resource_id'

    def GetMapreduceBinaryBuildName(self):
        """
            Name of the build sub-task input parameter that enables building the mapreduce-yt binary.
        """
        return 'build_mapreduce-yt'

    def GetMapreduceBinaryResourceIdName(self):
        """
            Key in the build sub-task's context that holds the mapreduce-yt binary's resource id.
        """
        return 'mapreduce-yt_resource_id'


# Sandbox task that fetches samples of logs into a dated folder on YT and,
# optionally, publishes a resource describing where the samples live.
class FetchDataTask(SandboxTask):
    type = "FETCHING_DATA_FOR_CREATE_SESSIONS_TESTS"
    # resource requirements declared for the task
    cores = 1
    required_ram = 4096
    execution_space = 4096

    # pip packages required by the task; yt.wrapper is imported lazily by the fetcher
    environment = [
        environments.PipEnvironment("yandex-yt", version='0.10.8'),
        environments.PipEnvironment('future'),
    ]

    # parameter classes defined at the top of this module
    input_parameters = [FetchToolId, YtToolId, MapreduceId, OriginalLogsPathsPrefix,
                        MapreduceCluster, FetchesFolder, FetchDate, LogsList, FetchForRemProcessesTesting,
                        RemServer, DebugMode, CreateFreshdataResource, FreshdataResourceType, FetchingMode,
                        FetchHourBeg, MinCount, TryFastInput, RowsToFetchCount, SubfolderName, YTToken,
                        YTTokenOwner, YTPool, CopyResources]

    def __init__(self, *args, **kwargs):
        # simply forwards every argument to the base task constructor
        SandboxTask.__init__(self, *args, **kwargs)

    def GetYTToken(self):
        """
            Read the YT token from the vault; the secret's owner defaults to
            the task owner when the owner parameter is empty.
        """
        secret_owner = self.ctx.get(YTTokenOwner.name, '') or self.owner
        return self.get_vault_data(secret_owner, self.ctx[YTToken.name])

    def GetDailyLogsToFetch(self):
        """
            Return the list of daily logs to fetch.

            When fetching for rem_processes testing, the list comes from
            sessions_config. Otherwise it is the comma-separated 'logs_list'
            parameter or, when that is empty, the content of a READY
            USERDATA_LOGS_TO_CHECK resource. Names are stripped and blank
            items are dropped.

            Raises Exception when no resource is available or when the
            resulting list is empty.
        """
        if self.ctx.get(FetchForRemProcessesTesting.name, False):
            import sessions_config as logs_config
            return list(set(logs_config.GetAllDailyLogs()))

        raw_logs_list = self.ctx.get('logs_list')
        if not raw_logs_list:
            resource_object = common.rest.Client().resource.read(
                type=str(resource_types.USERDATA_LOGS_TO_CHECK),
                state=ctr.State.READY,
                limit=1
            )
            found_resources = resource_object['items']
            if not found_resources:
                # fail with a clear message instead of an IndexError
                raise Exception('No READY USERDATA_LOGS_TO_CHECK resource found')

            resource_path = self.sync_resource(found_resources[0]['id'])
            with open(resource_path, 'r') as resource_content_file:
                raw_logs_list = resource_content_file.read()

            logging.info('logs to fetch: ' + raw_logs_list)

        # str.split never returns an empty list, so blank items (empty input, stray
        # or trailing commas) must be filtered out for the emptiness check to work
        stripped_names = (item.strip() for item in raw_logs_list.split(','))
        daily_logs_to_fetch = [name for name in stripped_names if name]
        if not daily_logs_to_fetch:
            raise Exception('No logs to fetch')

        return daily_logs_to_fetch

    def GetFastLogsToFetch(self):
        """
            Return the list of fast logs to fetch: the ones from
            sessions_config when fetching for rem_processes testing, otherwise none.
        """
        if not self.ctx.get(FetchForRemProcessesTesting.name, False):
            return []

        import sessions_config as logs_config
        return list(set(logs_config.GetAllFastLogs()))

    def GetFetchDate(self, daily_logs_to_fetch):
        """
            Return the datetime of the daily logs to fetch: the 'fetch_date'
            parameter parsed as '%Y-%m-%d' or, when it is empty, the one
            calculated by the fetcher.
        """
        from time_periods import Periods

        requested_date = self.ctx['fetch_date']
        if requested_date in (None, ''):
            daily_datetime = self.Fetcher.CalculateSourcesDateTime(daily_logs_to_fetch, Periods.DAILY)
        else:
            daily_datetime = datetime.datetime.strptime(requested_date, '%Y-%m-%d')

        logging.info('date to fetch: ' + str(daily_datetime))
        return daily_datetime

    def GetFetchTime(self, fast_logs_to_fetch):
        """
            Return the datetime of the fast logs to fetch, as calculated by the fetcher.
        """
        from time_periods import Periods

        fetch_time = self.Fetcher.CalculateSourcesDateTime(fast_logs_to_fetch, Periods.FAST)
        logging.info('time to fetch: ' + str(fetch_time))
        return fetch_time

    def DoFetching(self):
        """
            Fetch the daily and the fast logs into the dated working folder.

            Nothing is fetched when the folder already carries the "fetched"
            marker. The freshdata resource is created afterwards when requested,
            whether or not a fetch took place.
        """
        from time_periods import Periods  # noqa

        daily_logs_to_fetch = self.GetDailyLogsToFetch()
        fast_logs_to_fetch = self.GetFastLogsToFetch()

        daily_datetime_to_fetch = self.GetFetchDate(daily_logs_to_fetch)
        fast_datetime_to_fetch = self.GetFetchTime(fast_logs_to_fetch)

        self.FETCHES_FOLDER = self.ctx['fetches_folder']
        # the sub-folder is named either after today or after the fetched logs' date
        if self.ctx[SubfolderName.name] == SubfolderName.fetching_date:
            subfolder_name_date = datetime.date.today()
        else:
            subfolder_name_date = daily_datetime_to_fetch.date()
        self.WORKING_FOLDER = us_ci.MRPathJoin(self.FETCHES_FOLDER, self.Fetcher.StrDateInLogName(subfolder_name_date))

        fetcher = self.Fetcher
        if fetcher.CheckIfAlreadyFetched(self.WORKING_FOLDER):
            logging.info('Already fetched')
        else:
            logging.info('Have to fetch')
            fetcher.CreateRootPath(self.WORKING_FOLDER)
            fetcher.Fetch(self.WORKING_FOLDER, daily_logs_to_fetch, daily_datetime_to_fetch, Periods.DAILY, self)
            fetcher.Fetch(self.WORKING_FOLDER, fast_logs_to_fetch, fast_datetime_to_fetch, Periods.FAST, self)

            fetcher.RefreshMeat(subfolder_name_date, self.FETCHES_FOLDER)

        if self.ctx['create_freshdata_resource']:
            fetcher.RefreshDataResources(daily_datetime_to_fetch, daily_logs_to_fetch, fast_datetime_to_fetch, fast_logs_to_fetch,
                                         self.WORKING_FOLDER, subfolder_name_date, self)

    def on_execute(self):
        """
            Task entry point: clean up old data, obtain the tool binaries and
            fetch the logs.

            Progress flags kept in the context ('preliminary_removal_done' and
            the fetching-done key) make the completed steps be skipped when
            this method runs again.
        """
        self.Fetcher = YtFetcher(self.ctx, yt_token=self.GetYTToken())

        # The clean-up runs once, before any binary is built or synced.
        if not self.ctx.get("preliminary_removal_done", False):
            self.Fetcher.RemoveWhatYouCanRemoveBeforeFetching(self.ctx['fetches_folder'])
            self.ctx["preliminary_removal_done"] = True

        # Build the binaries unless all three tool resources were supplied.
        if not self.ctx.get(MapreduceId.name) or not self.ctx.get(YtToolId.name) or not self.ctx.get(FetchToolId.name):
            self.Fetcher.BuildBinariesIfNeeded(self)

        # Explicitly supplied resources override the paths of the built binaries.
        if self.ctx.get(FetchToolId.name):
            self.ctx['fetcher_path'] = SandboxTask.sync_resource(self, self.ctx[FetchToolId.name])
        if self.ctx.get(YtToolId.name):
            self.ctx['yt_path'] = SandboxTask.sync_resource(self, self.ctx[YtToolId.name])
        if self.ctx.get(MapreduceId.name):
            self.ctx['mapreduce_path'] = SandboxTask.sync_resource(self, self.ctx[MapreduceId.name])

        if not self.ctx.get(FETCHING_DONE, False):
            # The mounted checkout provides the project modules imported lazily during fetching.
            with arc.Arc().mount_path(None, None, fetch_all=False) as arc_dir:
                ConfigureSysPath(arc_dir)
                self.DoFetching()
                self.ctx[FETCHING_DONE] = True


# Module-level alias of the task class.
# NOTE(review): presumably the name the Sandbox task loader looks for - confirm.
__Task__ = FetchDataTask
