# coding: utf-8

import os
import sys
import getpass
import logging
import time
import re
import urllib.request, urllib.parse, urllib.error
import tempfile
import traceback
import datetime
import platform
import calendar
import pytz
import shutil

from functools import wraps
from subprocess import PIPE, Popen
from collections import defaultdict, Counter
from dateutil.relativedelta import relativedelta

import requests
from unidiff import UnidiffParseError
from ylog.context import log_context
from django.conf import settings
from django import db
from django.utils.timezone import utc
from django.db.models.query_utils import Q
from django.db import connection, transaction, OperationalError
from django.utils.timezone import now, timedelta
from django.db import IntegrityError
from django.db.models import Count
from celery.exceptions import SoftTimeLimitExceeded
import yt.wrapper as yt
from intranet.dogma.dogma.core.utils import get_repository_model
from intranet.dogma.dogma.core.logic.users import EmailGuesser, get_user_data_by_email_from_staff
from intranet.dogma.dogma.core.dao.commits import (
    get_commits_map, get_all_repo_commits,
    create_commits_objects, get_commits_diff_by_batch,
    bind_commits_to_user, attach_commits_to_uid_by_emails,
    get_commits_for_file_statistics_aggregation,
    filter_duplicate_commits,
)
from intranet.dogma.dogma.core.errors.utils import get_parsed_error
from intranet.dogma.dogma.celery_app import app
from intranet.dogma.dogma.api.logic.query import prepare_date
from intranet.dogma.dogma.core.errors import BaseError, EmptyRepositoryError, NoRepositoryError
from .logic.commits import (
    slice_commits_in_batches,
    get_diff_data_for_batch,
    get_tracker_data, get_diff_map,
    get_batch_diff,
)
from .dao.source import all_sources, source_unavailable
from .dao.repo import make_repo_local
from .dao.clones import get_clone_by_id, get_clones_to_repair
from .logic import clone as clone_logic
from .models import Clone, Repo, Source, User, PushedCommit, ChangedFile, Node
from .utils import get_current_node, get_node_queue, locked_context, locked, get_random_node_queue
from .backends import get_backend
from .crawlers import get_crawler
from .logic.tasks_processing import mark_object_as_failed, mark_object_as_successful, retry_operational_error
from .logic.exception_processing import parse_exception
from .logic.celery_task import indexer_task
from .dao.repo import select_nodes_for_repo
from .dao.users import create_or_update_user_from_data, guess_main_user
from .logic.changed_file import get_changed_files_map
from .dao.changed_file import collect_extensions_data, get_changed_files_objects
from .logic.changed_file import update_extensions_data
from .dao.file_statistics import insert_file_statistics_data
from .abstract_repository import ChangedFilesMixin
from .logic.repos import make_valid_url

# Module-level logger shared by every task defined in this file.
log = logging.getLogger(__name__)

# Safety guard evaluated at import time: when stdout is a terminal and the
# process was started exactly as `/dogma-main shell`, only the OS user
# `www-data` may proceed; any other user gets an exception.
if sys.stdout.isatty() and sys.argv == ['/dogma-main', 'shell']:
    username = getpass.getuser()
    if username != 'www-data':
        raise Exception('{} should not run this'.format(username))


def get_lock_key(clone_id, task_name='default'):
    """
    Build the name of the lock that serializes one kind of task per object.

    :param clone_id: identifier embedded into the key (callers in this file
        pass either a clone id or a repo id).
    :param task_name: name of the task the lock belongs to.
    :return: string of the form ``clone_action(<id>)_<task_name>``.
    """
    return f'clone_action({clone_id!s})_{task_name!s}'


def get_queue_name(queue, is_important):
    """
    Choose the queue name for a task.

    :param queue: queue name to use for ordinary objects.
    :param is_important: truthy when the dedicated ``important_queue`` must
        be used instead of ``queue``.
    :return: ``'important_queue'`` or ``queue`` unchanged.
    """
    if is_important:
        return 'important_queue'
    return queue


def clone_task(task):
    """
    Any action on a clone.

    Decorator that turns ``task(clone, ...)`` into a celery task which is
    called with a clone id.  The generated task:

    * loads the ``Clone`` and skips the run when the clone does not exist or
      ``source_unavailable`` reports its source as unavailable;
    * takes a per-clone, per-task lock and skips the run when the lock is
      already held;
    * re-sends itself to the queue of the clone's node when it was started
      on a different node;
    * otherwise calls ``task`` with the ``Clone`` instance.

    Extra positional and keyword arguments given to the generated task are
    forwarded to ``task`` (and kept when the task is re-sent to another
    node), so optional parameters of the wrapped function can be used.

    :param task: callable whose first parameter is a ``Clone`` instance.
    :return: the celery task wrapping ``task``.
    """
    @app.task(time_limit=4 * 60 * 60)
    @wraps(task)
    def _wrapper(clone_id, *args, **kwargs):
        lock_key = get_lock_key(clone_id, task.__name__)

        try:
            clone = Clone.objects.select_related().get(id=clone_id)
            source = clone.repo.source
            if source_unavailable(source):
                log.warning('Skip clone task for "%s", source unavailable', clone_id)
                return
        except Clone.DoesNotExist:
            log.error('Clone "%s" does not exist for task "%s"', clone_id, task)
            return

        with locked_context(lock_key) as aquired_lock:
            if not aquired_lock:
                log.warning('Task with key "%s" already running, skipping task', lock_key)
                return

            if get_current_node() != clone.node:
                # The clone lives on another node: hand the task over to that
                # node's queue, keeping the caller's extra arguments.
                return _wrapper.apply_async(
                    args=(clone.id,) + args,
                    kwargs=kwargs,
                    queue=get_node_queue(get_queue_name('clone', clone.repo.is_important),
                                         clone.node,
                                         )
                )

            with log_context(clone=clone.id, repository=str(clone.repo)):
                return task(clone, *args, **kwargs)

    return _wrapper


@indexer_task
def node_info():
    """
    Store disk usage of ``settings.DOGMA_ROOT`` on the current node record.

    Total and available space are taken from ``os.statvfs``.  The node is
    disabled when ``is_space_running_short()`` reports a shortage and
    enabled otherwise.
    """
    fs_stat = os.statvfs(settings.DOGMA_ROOT)
    fragment_size = fs_stat.f_frsize

    node = get_current_node(create_missing=True, no_cache=True)
    node.space_total = fs_stat.f_blocks * fragment_size
    # f_bavail counts the blocks available to unprivileged users.
    node.space_available = fs_stat.f_bavail * fragment_size
    node.enabled = not node.is_space_running_short()
    node.save()


@app.task(time_limit=8 * 60 * 60)  # 8 hours
def clone_repo(repo_id, force=False):
    """
    Clone repository ``repo_id`` onto the current node and record the result.

    Steps, all performed under the ``clone_repo_<repo_id>`` lock:

    1. Skip when the repo is in ``fail`` status and its last failure is more
       recent than ``repo.sync_delay`` hours.
    2. Find the ``Clone`` row for (current node, repo) or create it.  When it
       already exists the task stops unless ``force`` is true.  When the row
       cannot be created the repo is marked as failed and the task stops.
    3. Run ``backend.clone()`` (with ``clone_branches=True`` for the ``svn``
       source), retrying up to 10 times on "out of pty devices" OS errors.
    4. In ``finally``: re-read repo and clone from the database and either
       mark the repo failed and delete the clone, or mark it successful and
       activate the clone; then optionally call ``create_commits``.

    :param repo_id: primary key of the ``Repo`` to clone.
    :param force: re-run the clone even if a ``Clone`` row already exists on
        this node.
    :return: ``None``.
    """
    lock_name = 'clone_repo_{}'.format(repo_id)
    with locked_context(lock_name) as acquired:
        if not acquired:
            log.debug('Lock "%s" is aquired', repo_id)
            return

        node = get_current_node()
        repo = Repo.objects.select_related('source').get(id=repo_id)

        task_fail = False
        with log_context(repo_id=repo_id, repo=str(repo), node=str(node),
                         task_id=clone_repo.request.id, timelimit=clone_repo.request.timelimit):

            log.info(f'Preparing to clone repo {repo.id}')

            # Back off: a recently failed repo is not retried until
            # `sync_delay` hours have passed since the failure.
            if (
                repo.status == Repo.SYNC_STATUSES.fail
                and now() - repo.last_sync_fail_time < timedelta(hours=repo.sync_delay)
            ):
                log.info(f'Skipping clone repo {repo.id}, too much fails')
                return

            backend = get_backend(repo)

            try:
                clone = Clone.objects.get(node=node, repo=repo)

            except Clone.DoesNotExist:
                log.info(f'Trying to create clone for repo {repo.id} on node {node.id}')
                try:
                    clone = Clone.objects.create(
                        repo=repo,
                        node=node,
                        status=Clone.STATUSES.new,
                        space_required=0,
                        path=backend.path,
                    )
                    log.info('Successfully create clone for repo on node')
                except Exception as exc:
                    log.exception('Can not create clone for repo on node')
                    with transaction.atomic():
                        # Work on a fresh copy of the repo row.
                        repo = Repo.objects.get(pk=repo_id)
                        trace = traceback.format_exc()
                        parsed_error = get_parsed_error(exc, trace)
                        mark_object_as_failed(repo, trace, parsed_error)
                        repo.save()
                    log.info(f'Saved repo {repo.id} data after task clone_repo')
                    return
            else:
                if not force:
                    log.warning('Clone for repo already exists on this node')

                    return

        with log_context(clone=clone.id, repo_id=repo_id, repo=str(repo), node=str(node), repository=str(clone.repo),
                         task_id=clone_repo.request.id, timelimit=clone_repo.request.timelimit):
            try:
                for _ in range(10):
                    try:
                        log.info('Starting clone process for repo on node')
                        kwargs = {}
                        if repo.source.code == 'svn':
                            kwargs['clone_branches'] = True
                        backend.clone(**kwargs)
                        count = backend.commits_in_default_branch()
                        log.info('Successfully finished clone process for repo on node')
                        break
                    except os.error as exc:
                        # Only the "out of pty devices" condition is retried.
                        if "out of pty devices" in str(exc):
                            time.sleep(20)
                            continue
                        raise
                else:
                    raise ValueError('retries ended, sorry')
            except Exception as exc:
                task_fail = True
                try:
                    trace = traceback.format_exc()
                except Exception as format_exc:
                    # A distinct name is required here: re-using `exc` would
                    # unbind the outer exception when this handler exits and
                    # break the `get_parsed_error(exc, ...)` call below.
                    trace = 'Got exception while formating trace: "{}"'.format(repr(format_exc))
                parsed_error = get_parsed_error(exc, trace)

                if isinstance(exc, EmptyRepositoryError):
                    log.info('No commits count for empty repository')

                elif isinstance(exc, BaseError):
                    try:
                        stdout, stderr = parse_exception(exc)
                        log.error(
                            'Failed clone on node %s:\nSTDOUT:\n%s\nSTDERR:\n%s',
                            node,
                            stdout,
                            stderr
                        )
                    except UnicodeDecodeError:
                        log.error('Got unicode error while parsing error for repo on node')
                else:
                    log.error('Failed clone repo on node: %s', repr(exc))
            else:
                log.info('Cloned repo')

            finally:
                # Cloning can take hours; drop the connections so the
                # bookkeeping below starts with fresh ones.
                db.connections.close_all()
                with transaction.atomic():
                    repo = Repo.objects.get(pk=repo_id)
                    clone = Clone.objects.get(repo=repo, id=clone.id)
                    if task_fail:
                        mark_object_as_failed(repo, trace, parsed_error)
                        clone_logic.delete_clone(clone)
                        repo.clone_attempt += 1
                    else:
                        mark_object_as_successful(repo)
                        clone.commits_count = count
                        clone.status = Clone.STATUSES.active
                        repo.create_commits_needed = True
                        repo.clone_attempt = 1

                    repo.save()
                    if task_fail:
                        # The clone was deleted above, so it is not saved.
                        # NOTE(review): this `return` sits inside `finally`
                        # and therefore also discards any exception that is
                        # still propagating at this point.
                        return
                    clone.save()
                if repo.source.need_create_commits:
                    create_commits(clone.id)
                log.info('Saved clone and repo data after task clone_repo')


@app.task(time_limit=6 * 60 * 60, soft_time_limit=5.9 * 60 * 60)
@transaction.atomic
def update_source(source_id):
    """
    Synchronize the ``Repo`` rows of one source with what its crawler reports.

    Runs under the ``update_source(<source_id>)`` lock and inside a single
    ``transaction.atomic`` block.  For every repository yielded by
    ``crawler.get_repos()`` the matching ``Repo`` (same source, owner and
    name) is created or has its changed attributes updated; newly created
    repos are attached to organizations.  Outside of the business
    installation, repos of the source that the crawler did not report are
    flagged ``on_remote=False``.  The source itself is marked successful or
    failed and saved in ``finally``.

    :param source_id: primary key of the ``Source`` to update.
    :return: ``None``.
    """
    lock_key = 'update_source(%s)' % source_id
    with locked_context(lock_key) as aquired_lock:
        if not aquired_lock:
            log.warning('Lock `%s` is acquired by another process', lock_key)

            return

        source = Source.objects.get(id=source_id)

        with log_context(source=str(source)):

            log.info(f'Started to parse source {source_id}')

            try:

                # Sources without a crawler have nothing to synchronize.
                # NOTE(review): the `finally` clause below still runs after
                # this `return`, so the source is saved and the "Saved
                # source" message is logged a second time.
                if not source.use_crawler:
                    mark_object_as_successful(source)
                    source.save()
                    log.info('Saved source "%s" data after task update_source', source)
                    return

                crawler = get_crawler(source)
                if not crawler:
                    log.error(
                        'Cant find crawler for source "%s"', source.name
                    )
                    return

                # Ids of all repos reported by the crawler during this run.
                existing = set()
                # NOTE(review): `new_repo_ids` is filled below but not read
                # anywhere else in this function.
                new_repo_ids = []

                for new_repo in crawler.get_repos():
                    # NOTE(review): looks like a debugging aid for one
                    # specific default branch value; confirm whether it is
                    # still needed.
                    if new_repo.default_branch == '\xF0\x9F\x98\x93':
                        log.error(
                            'You are looking at repo "%s"', new_repo.vcs_name
                        )
                    is_new = False
                    try:
                        repo = Repo.objects.get(
                            source=source,
                            owner=new_repo.owner,
                            name=new_repo.name,
                        )
                    except Repo.DoesNotExist:
                        repo = Repo(source=source)
                        repo.on_remote = True
                        is_new = True

                    updated = False

                    # `connect_organization` is handled separately below, so
                    # it is removed from the attributes copied onto the repo.
                    connect_organization = new_repo.connect_organization
                    dict_repo = new_repo._asdict()
                    del dict_repo['connect_organization']

                    # Copy over only the attributes whose value changed.
                    for attr, value in dict_repo.items():
                        old_value = getattr(repo, attr)
                        if old_value != value:
                            setattr(repo, attr, value)
                            updated = True
                    # Captured before `save()`: an empty id means the repo
                    # row did not exist yet.
                    repo_id = repo.id
                    if updated or not repo_id:
                        if settings.IS_BUSINESS and not repo.url:
                            repo.url = make_valid_url(
                                source=source,
                                name=new_repo.name,
                                owner=new_repo.owner,
                            )
                        repo.save()
                    if not repo_id:
                        # Newly created repo: attach it to the organizations
                        # reported by the crawler (business installation) or
                        # to the internal one.
                        if settings.IS_BUSINESS:
                            orgs_id = connect_organization
                        else:
                            orgs_id = [settings.INTERNAL_DIR_ID]
                        if orgs_id:
                            repo.connect_organization.add(*orgs_id)

                    existing.add(repo.id)
                    if is_new:
                        new_repo_ids.append(repo.id)

                # Repos the crawler no longer reports are flagged as absent
                # on the remote (not done for the business installation).
                if not settings.IS_BUSINESS:
                    source.repo_set.exclude(id__in=list(existing)).update(on_remote=False)

            except Exception as exc:
                log.exception('Failed to update source "%s"', repr(exc))
                trace = traceback.format_exc()
                parsed_error = get_parsed_error(exc, trace)
                mark_object_as_failed(source, trace, parsed_error)

            else:
                log.info('Successful update of source "%s"', source.name)
                mark_object_as_successful(source)

            finally:
                # Persist the status set by whichever branch ran above.
                source.save()
                log.info('Saved source "%s" data after task update_source', source)


@app.task
@locked('update_sources')
def update_sources():
    """
    Queue an ``update_source`` task for every source that is not hidden.

    Each task is sent to the queue returned by
    ``get_random_node_queue('clone')``.
    """
    visible_sources = Source.objects.filter(hidden=False)
    for visible_source in visible_sources:
        target_queue = get_random_node_queue('clone')
        update_source.apply_async(args=(visible_source.id,), queue=target_queue)


@app.task
def update_repo_connections(repo_id):
    """
    Update meta information about github repositories.

    For a repository hosted on a ``github`` source, asks the GitHub API
    whether it is a fork and, if so, stores the already known ``Repo`` of
    its parent in ``repo.parent``.

    :param repo_id: primary key of the ``Repo`` to inspect.
    :return: a short string explaining why nothing was changed, or ``None``
        when the repo is not a fork or its parent was stored.
    """
    repo = Repo.objects.select_related('source').get(id=repo_id)
    if repo.source.web_type != 'github':
        return 'not a github repo'

    crawler = get_crawler(repo.source)
    if not crawler:
        # get_crawler() may return nothing for a source; without this guard
        # the call below fails with an AttributeError.
        log.error('Cant find crawler for source "%s"', repo.source.name)
        return 'no crawler for source'

    github = crawler.api_wrapper()
    gh_repo = github.repository(repo.owner, repo.name)
    if not gh_repo.fork:
        return

    gh_parent = gh_repo.parent
    try:
        repo.parent = Repo.objects.get(
            source=repo.source,
            owner=gh_parent.owner.login,
            name=gh_parent.name,
        )
    except Repo.DoesNotExist:
        # The parent is looked up within the same source only; a fork of a
        # repository that is not present there is not an error.
        log.warning(
            'Parent "%s/%s" of repo "%s" is not known, skip linking',
            gh_parent.owner.login, gh_parent.name, repo.id,
        )
        return 'parent repo is not known'
    repo.save()


@indexer_task
def update_important_clones():
    """
    Queue ``fetch_clone`` for the important clones of the current node.

    Selects clones in ``active`` or ``fail`` status whose repo is flagged
    important and whose source is in ``success`` status, and sends one task
    per clone to the node's ``important_queue``.
    """
    log.info('Started update_important_clones task')
    important_clones = Clone.objects.on_current_node().filter(
        status__in=('active', 'fail'),
        repo__is_important=True,
        repo__source__status=Source.SYNC_STATUSES.success,
    )
    for important_clone in important_clones:
        target_queue = get_node_queue('important_queue')
        fetch_clone.apply_async(args=[important_clone.id], queue=target_queue)
    log.info('Finished sending update_important_clones tasks')


@app.task
@locked('check_active_repos')
def check_active_repos():
    """
    Check repositories for new commits.

    If a repository has had no new commits for a long time (more than 180
    days) and its latest commit record was created more than 10 days ago
    (to exclude the case when the commits simply have not been created
    yet), the repository is marked as not active (such repositories are
    updated less often).

    Only active, successfully synchronized, non-important repositories are
    examined.
    """
    log.info('Started checking active repos process')
    candidates = Repo.objects.filter(
        is_active=True,
        status=Repo.SYNC_STATUSES.success,
        is_important=False,
    )
    active_from = now() - timedelta(days=180)
    last_created_to = now() - timedelta(days=10)

    inactive_ids = []
    for candidate in candidates:
        newest = PushedCommit.objects.filter(repo=candidate).order_by('-commit_time').first()
        # Repos without any commit record are left untouched.
        if not newest:
            continue
        if not (newest.commit_time < active_from):
            continue
        if not newest.created:
            continue
        if not (newest.created < last_created_to):
            continue
        inactive_ids.append(candidate.id)
        log.info('Add repo "%s" to not active repos', candidate.id)

    Repo.objects.filter(id__in=inactive_ids).update(is_active=False)
    log.info(
        'Finished checking active repos process, mark "%s" repos as not active',
        len(inactive_ids),
    )


@indexer_task
def update_clones(source_id=None, only_active=True, old_first=False, clone_modified_check=False):
    """
    Queue ``fetch_clone`` for the non-important clones of the current node.

    :param source_id: restrict to clones of this source; when ``None`` only
        clones whose source is in ``success`` status are taken.
    :param only_active: value the repo's ``is_active`` flag must have; when
        false, the repo must additionally be flagged ``on_remote``.
    :param old_first: order the clones by their ``modified`` field.
    :param clone_modified_check: accepted but not used by this function.
    """
    log.info('Started update_clones task')
    queryset = Clone.objects.on_current_node().filter(
        status__in=('active', 'fail'),
        repo__is_active=only_active,
        repo__is_important=False,
    )

    if source_id is None:
        queryset = queryset.filter(repo__source__status=Source.SYNC_STATUSES.success)
    else:
        queryset = queryset.filter(repo__source_id=source_id)

    if not only_active:
        queryset = queryset.filter(repo__on_remote=True)

    if old_first:
        queryset = queryset.order_by("modified")

    log.info('Update clones of source "%s" called', source_id)
    for pk in queryset.values_list('id', flat=True):
        target_queue = get_node_queue('clone')
        fetch_clone.apply_async(args=[pk], queue=target_queue)
    log.info('Finished sending update_clones_in_batch tasks')


@app.task(time_limit=60 * 60, soft_time_limit=60 * 40)
def update_clones_in_batch(clones_ids):
    """
    Fetch the given clones one after another inside a single task.

    Ids are taken from the end of ``clones_ids`` (the list is consumed in
    place).  When the soft time limit fires, the id being processed is put
    back and the remaining ids are re-queued as a new task with a five
    minute countdown.  Any other error is logged and the loop moves on.

    :param clones_ids: mutable list of clone ids.
    """
    log.info('Started fetch_clone tasks for batch')
    while clones_ids:
        current_id = clones_ids.pop()
        with log_context(clone_id=current_id, task='update_clones_in_batch'):
            try:
                retry_operational_error(fetch_clone, current_id)
            except SoftTimeLimitExceeded:
                # The interrupted clone goes back to the list so the
                # follow-up task picks it up again.
                clones_ids.append(current_id)
                remaining = len(clones_ids)
                log.warning('Update clones timed out, still need to update "%s"', remaining)
                update_clones_in_batch.apply_async(
                    args=[clones_ids],
                    queue=get_node_queue('clone'),
                    countdown=5 * 60,
                )
                return
            except Exception as error:
                log.error('Got unhandled error "%s"', repr(error))
    log.info('Finished fetch_clone tasks for batch')


@clone_task
def fetch_clone(clone, clone_modified_check=True):
    """
    Fetch new data for ``clone`` from its remote and record the outcome.

    After ``backend.fetch()`` the repo and the clone are re-read from the
    database and marked as successful/active or failed; when the fetch
    succeeded and commits need to be (re)created, ``create_commits`` is
    called.

    :param clone: ``Clone`` instance to fetch.
    :param clone_modified_check: when true, the fetch is skipped if
        ``need_skip_fetch_clone`` says the clone was modified recently.
    :return: ``None``.
    :raises SoftTimeLimitExceeded: re-raised when it interrupts the fetch.
    """
    if clone_modified_check and need_skip_fetch_clone(clone):
        return
    repo = clone.repo
    backend = get_backend(repo)
    task_fail = False
    with log_context(clone=str(clone), repo_id=repo.id, repo=str(repo), clone_id=clone.id, task='fetch_clone',
                     task_id=fetch_clone.request.id, timelimit=fetch_clone.request.timelimit):

        log.info('Fetching clone %s', clone.id)
        has_new_commits = None
        try:
            has_new_commits = backend.fetch()

        except Exception as exc:
            task_fail = True
            trace = traceback.format_exc()
            parsed_error = get_parsed_error(exc, trace)
            if isinstance(exc, EmptyRepositoryError):
                log.warning('Failed fetch clone %s, repository is empty', clone.id)
            elif isinstance(exc, NoRepositoryError):
                if settings.IS_BUSINESS:
                    # Business installation: drop the clone and schedule a
                    # fresh clone of the repo.
                    # NOTE(review): the `finally` clause below re-reads this
                    # clone from the database; if `delete_clone` removes the
                    # row, that lookup raises Clone.DoesNotExist - confirm.
                    clone_logic.delete_clone(clone)
                    clone_repo.apply_async(
                        args=[repo.id],
                        queue=get_node_queue('celery'),
                    )
                else:
                    log.warning('Failed fetch clone %s, repository is missing, mark repo as local', clone.id)
                    with transaction.atomic():
                        make_repo_local(repo)
                    # The `finally` clause still runs after this `return`.
                    return

            elif isinstance(exc, BaseError):
                log.exception('Failed fetch clone %s', clone.id)

            elif isinstance(exc, SoftTimeLimitExceeded):
                raise
            # Any other exception type is not logged here; it is only
            # recorded through `mark_object_as_failed` in `finally`.
        else:
            log.info('Finished fetching clone %s', clone.id)
            if not has_new_commits:
                log.info('No new commits found')

        finally:
            # Drop the connections before the bookkeeping queries below.
            db.connections.close_all()
            with transaction.atomic():
                # Work on fresh copies of both rows.
                repo = Repo.objects.get(pk=repo.id)
                clone = Clone.objects.get(repo=repo, id=clone.id)
                if task_fail:
                    mark_object_as_failed(repo, trace, parsed_error)
                    clone.status = clone.STATUSES.fail
                else:
                    clone.status = clone.STATUSES.active
                    mark_object_as_successful(repo)
                    repo.on_remote = True
                if has_new_commits:
                    repo.create_commits_needed = True
                repo.save()
                clone.save()
            log.info('Saved clone and repo data after task fetch_clone')

            if not task_fail:
                # Commits are created when the repo asks for it on every
                # fetch, or when the active clone has new/pending commits -
                # and only for sources that need commits at all.
                if repo.create_commits_always or (
                            clone.status == Clone.STATUSES.active and
                            (has_new_commits or repo.create_commits_needed)
                ):
                    if repo.source.need_create_commits:
                        create_commits(clone.id)


def need_skip_fetch_clone(clone):
    """
    Tell whether ``clone`` was modified too recently to be fetched again.

    The window is 1 hour for clones of important repos and 6 hours for all
    others.

    :param clone: ``Clone`` instance; its ``modified`` timestamp is compared
        with the current time.
    :return: ``True`` when the fetch should be skipped, ``False`` otherwise.
    """
    if clone.repo.is_important:
        hours_delta = 1
    else:
        hours_delta = 6

    if not (clone.modified > now() - timedelta(hours=hours_delta)):
        return False

    log.info(
        'Skip fetch clone because already modified in last %s hour(s). Modified at %s', hours_delta, clone.modified
    )
    return True


@clone_task
def delete_clone(clone):
    """Delete ``clone`` by delegating to ``clone_logic.delete_clone``."""
    clone_logic.delete_clone(clone)


@indexer_task
def run_gc_on_node(sources_ids):
    """
    Queue a ``git_gc`` task for clones stored on the current node.

    :param sources_ids: iterable of source ids to restrict the clones to, or
        ``None`` to take clones of every source.  Only clones in ``active``
        or ``fail`` status are considered.
    """
    log.info('Started run_gc_on_node task')
    queryset = Clone.objects.on_current_node().filter(status__in=('active', 'fail'))
    if sources_ids is not None:
        queryset = queryset.filter(repo__source_id__in=sources_ids)

    for pk in queryset.values_list('id', flat=True):
        target_queue = get_node_queue('clone')
        git_gc.apply_async(args=[pk], queue=target_queue)
    log.info('Finished sending git_gc tasks')


@clone_task
def git_gc(clone):
    """
    Run ``git gc --aggressive`` in the working directory of ``clone``.

    Nothing is done when the clone belongs to another node or when its path
    does not exist on disk.

    :param clone: ``Clone`` instance to collect garbage for.
    """
    belongs_elsewhere = clone.node.id != get_current_node().id
    if belongs_elsewhere or not os.path.exists(clone.path):
        return

    with log_context(clone_id=clone.id, repository=str(clone.repo)):
        repo_backend = get_backend(clone.repo)

        log.info('Starting git gc process')
        repo_backend.run_git_command('gc', '--aggressive', _cwd=clone.path)
        log.info('git gc successfully finished')


@app.task
@locked('repair_failed')
def repair_failed(sources_ids=None):
    """
    Queue a ``repair_clone`` task for every clone that needs repairing.

    :param sources_ids: ids of the sources to look at; when empty or
        ``None``, all sources that are not hidden and are in ``success``
        status are used.
    """
    if not sources_ids:
        healthy_sources = Source.objects.filter(
            hidden=False,
            status=Source.SYNC_STATUSES.success,
        )
        sources_ids = healthy_sources.values_list('id', flat=True)

    for broken_clone in get_clones_to_repair(sources_ids):
        # The task is sent to the queue of the node that holds the clone.
        target_queue = get_node_queue('clone', broken_clone.node)
        repair_clone.apply_async(args=(broken_clone.id,), queue=target_queue)


@clone_task
def repair_clone(clone):
    """
    Try to bring a broken clone back to life.

    First a regular fetch is attempted.  If the clone is ``active``
    afterwards nothing else is done; otherwise the clone is deleted and the
    repository is cloned again from scratch.

    :param clone: ``Clone`` instance to repair.
    """
    with log_context(clone_id=clone.id, repository=str(clone.repo)):
        log.info('Starting repairing process for clone')
        fetch_clone(clone.id)

        # fetch_clone loads its own Clone instance by id and saves the new
        # status on that one, so the status held by `clone` is stale here.
        # Without reloading it, a clone that was just fetched successfully
        # would still be deleted and cloned again.
        try:
            clone.refresh_from_db(fields=['status'])
        except Clone.DoesNotExist:
            log.warning('Clone does not exist anymore, nothing to repair')
            return

        if clone.status == clone.STATUSES.active:
            log.info('Successfully fetch clone while repairing')
            return
        repo_id = clone.repo.id
        log.info('Deleting clone')
        clone_logic.delete_clone(clone)
        log.info('Started clone process after deleting')
        clone_repo(repo_id)


@app.task
@locked('clone_sources')
def clone_sources():
    """
    Queue a ``clone_source`` task for every visible, healthy source.

    A source qualifies when it is not hidden and is in ``success`` status;
    each task goes to the queue returned by
    ``get_random_node_queue('clone')``.
    """
    healthy_sources = Source.objects.filter(
        hidden=False,
        status=Source.SYNC_STATUSES.success,
    )
    for healthy_source in healthy_sources:
        target_queue = get_random_node_queue('clone')
        clone_source.apply_async(args=(healthy_source.id,), queue=target_queue)


@app.task
def clone_source(source_id):
    """
    Schedule ``clone_repo`` tasks for repositories of one source.

    Runs under the ``clone_source(<source_id>)`` lock.  Up to
    ``source.get_rate_delta()`` repositories are picked at random: first
    those without any clone (in the business installation also limited by
    ``DOGMA_CLONE_ATTEMPT_TO_FAIL``), then - business installation only -
    successfully synced repos that have fewer clones than there are enabled
    nodes.  For each picked repo the first node returned by
    ``select_nodes_for_repo`` receives the task.

    A repo for which no node is acceptable is excluded from further picks,
    so the loop always terminates: every iteration either sends a task or
    shrinks the set of candidates.

    :param source_id: primary key of the ``Source`` to process.
    """
    lock_key = 'clone_source(%s)' % source_id
    with log_context(source_id=source_id, task='clone_source', task_id=clone_source.request.id,
                     timelimit=clone_source.request.timelimit):

        with locked_context(lock_key) as aquired_lock:
            if not aquired_lock:
                log.warning('Lock "%s" is acquired by another process', lock_key)

                return

            source = Source.objects.get(id=source_id)

            log.info(f'Started to clone source {source.id} "{source.code}"')

            delta = source.get_rate_delta()
            nodes_count = Node.objects.filter(enabled=True).count()
            filter_data = {'clones__isnull': True}
            if settings.IS_BUSINESS:
                filter_data['clone_attempt__lte'] = settings.DOGMA_CLONE_ATTEMPT_TO_FAIL

            # Repos that could not be placed on any node during this run.
            unplaceable_ids = []

            # `delta > 0` also stops the loop for a negative budget, which
            # `while delta` alone would never do.
            while delta and delta > 0:
                try:
                    repo = source.repo_set.filter(
                        **filter_data
                    ).exclude(
                        id__in=unplaceable_ids
                    ).order_by('?')[0]
                except IndexError:
                    if settings.IS_BUSINESS:
                        try:
                            repo = source.repo_set.annotate(
                                clones_count=Count('clones')).filter(
                                status='success', clones_count__lt=nodes_count,
                            ).exclude(
                                id__in=unplaceable_ids
                            ).order_by('?')[0]
                        except IndexError:
                            break
                    else:
                        break

                acceptable_volumes = select_nodes_for_repo(repo)
                if len(acceptable_volumes) > 0:
                    volume = acceptable_volumes[0]
                    log.info('Sending clone task for repo %s "%s"', repo.id, repo.full_name)
                    queue = 'clone'
                    if settings.IS_BUSINESS and repo.status == 'new':
                        queue = 'celery'
                    clone_repo.apply_async(args=(repo.id,), queue=get_node_queue(queue, volume))
                    delta -= 1
                else:
                    log.error('No acceptable volumes found for repo %s', repo.id)
                    # Do not pick this repo again: the budget is not spent
                    # here, so retrying it could loop forever.
                    unplaceable_ids.append(repo.id)


@app.task
@locked('update_users')
def update_users():
    """
    Download all persons from the staff API and store them as users.

    Pages of 500 persons are requested one after another by following the
    ``links.next`` URL of every response.  A failed request (exception or a
    status code other than 200) is retried up to five more times with a one
    second pause; after that the error is propagated.  Once all pages are
    read, every person is passed to ``create_or_update_user_from_data`` and
    ``make_users_txt`` is called.
    """
    logger = logging.getLogger(f'{__name__}.fetch_users')

    session = requests.session()
    session.headers['Authorization'] = 'OAuth %s' % settings.DOGMA_OAUTH_TOKEN
    session.verify = settings.YANDEX_ROOT_CERTIFICATE

    query_string = urllib.parse.urlencode({
        '_fields': ','.join(settings.STAFF_FIELDS),
        '_limit': '500',
    })
    next_url = 'https://{staff_api_host}/v3/persons?{query}'.format(
        staff_api_host=settings.DOGMA_STAFF_API_HOST,
        query=query_string,
    )

    persons = []

    while next_url:
        # One initial attempt plus five retries.
        for retries_left in range(5, -1, -1):
            try:
                response = session.get(next_url, allow_redirects=False, timeout=5)
                if response.status_code != 200:
                    raise requests.HTTPError(
                        "Invalid status code: %s" % response.status_code,
                        response=response,
                    )
            except Exception:
                logger.exception("Failed to fetch {}".format(next_url))
                if not retries_left:
                    raise
                time.sleep(1)
            else:
                break

        data = response.json()
        persons.extend(data['result'])

        has_next_page = 'links' in data and 'next' in data['links']
        next_url = data['links']['next'] if has_next_page else None

    for person in persons:
        create_or_update_user_from_data(person)

    make_users_txt()


@indexer_task
def make_users_txt():
    """
    Regenerate the users file at ``settings.DOGMA_SVN_USERS_PATH``.

    The file gets one ``<name> <<email>>`` line per user, ordered by login,
    followed by a fixed ``(no author)`` mapping line.  It is written to a
    temporary file inside ``settings.DOGMA_TEMP_DIR`` first and then moved
    over the target with ``os.replace``, so readers never observe a missing
    or half-written file.  The temporary file is removed if anything fails.

    NOTE: like the hard link used previously, ``os.replace`` requires the
    temporary directory and the target to be on the same filesystem.
    """
    swap = tempfile.NamedTemporaryFile(delete=False, dir=settings.DOGMA_TEMP_DIR)
    try:
        with swap:
            for user in User.objects.order_by('login'):
                line = '{user.name} <{user.email}>\n'.format(user=user)
                swap.write(line.encode('utf-8'))
            swap.write(b'(no author) = unknown <unknown@unknown.unknown>\n')

        os.chmod(swap.name, 0o644)
        # Atomically replaces the previous version of the file, if any.
        os.replace(swap.name, settings.DOGMA_SVN_USERS_PATH)
    except Exception:
        # Best-effort cleanup of the temporary file; the original error is
        # the one that matters and is re-raised below.
        try:
            os.unlink(swap.name)
        except OSError:
            pass
        raise


@app.task
@locked('count_commits_number')
def count_commits_number():
    """Call ``update_commits_pct()`` on every source returned by ``all_sources()``."""
    for source in all_sources():
        source.update_commits_pct()


@app.task(time_limit=60 * 60 * 4, soft_time_limit=60 * 60 * 3)
def create_commits(clone_id):
    """
    Create commit objects in the database.

    Walks the whole repository branch by branch, finds commits that
    have not been created yet and creates them.

    :param clone_id: id of the clone whose repository should be processed.

    The task returns silently when the clone no longer exists or when the
    per-repository lock is already held. On a soft time limit it re-queues
    itself with a 15 minute countdown; any other exception marks the
    repository as failed and is logged.
    """
    # NOTE(review): presumably done so the task starts with fresh DB
    # connections - confirm.
    db.connections.close_all()
    clone = get_clone_by_id(clone_id)

    if not clone:
        return  # the clone has been deleted

    lock_key = get_lock_key(clone.repo.id, 'create_commits')
    with log_context(clone=clone.id, repository=str(clone.repo), repo_id=clone.repo.id,
                     task='create_commits'):
        with locked_context(lock_key) as acquired_lock:
            if not acquired_lock:
                log.warning('Task with key "%s" already running, skipping task', lock_key)
                return

            repository = clone.repo
            try:
                log.info('Starting create commits')
                repo_raw = get_repository_model(clone)
                commits_to_create = get_all_repo_commits(clone, repo_raw)
                # Special case for one repository: only commits from the last
                # 365 days are loaded into the commit map.
                if repository.id == 50016:
                    # XXX DOGMA-767
                    commit_map = {
                        pushed_commit.commit: pushed_commit
                        for pushed_commit in PushedCommit.objects.filter(
                            repo=clone.repo,
                            commit_time__gte=datetime.datetime.now()- datetime.timedelta(days=365)
                        )
                    }
                else:
                    commit_map = get_commits_map(repository)

                guesser = EmailGuesser()
                has_new = False
                # Commits collected for the YT audit table; passed into
                # create_commits_objects and flushed in chunks below.
                pushed_commits = []
                log.info('Got all preliminary data for commits creation')

                for batch in slice_commits_in_batches(commits_to_create, slice_size=20):
                    if len(batch) == 0:
                        log.info('No commits in batch')
                        continue
                    batch_unique_commits = filter_duplicate_commits(commits=batch, repository=repository)
                    batch_diff = get_batch_diff(batch=batch_unique_commits, repo_raw=repo_raw)
                    log.info('Getting existing commit map')
                    existing_commits_diff = get_commits_diff_by_batch(batch_unique_commits)
                    existing_diff_map = get_diff_map(existing_commits_diff)
                    log.info('Getting data for commits creation')
                    diff_data = get_diff_data_for_batch(batch_diff=batch_diff,
                                                        diff_map=existing_diff_map,
                                                        )
                    log.info('Get diff data for batch')
                    changed_files_map = get_changed_files_map(batch_diff=batch_diff)
                    log.info('Get changed files for batch')
                    tracker_data = get_tracker_data(batch_unique_commits)
                    log.info('Got all data for commits creation')
                    # NOTE(review): the flag is raised before the batch is
                    # written, so it stays True even if both attempts below fail.
                    has_new = True
                    # At most two attempts per batch: a DB OperationalError
                    # closes the connections and retries, any other error
                    # abandons the batch, success leaves the loop.
                    for _ in range(2):
                        try:
                            log.info('Started creating commits in batch process')
                            create_commits_objects(commits=batch_unique_commits, diff_data=diff_data,
                                                   commit_map=commit_map, repo_raw=repo_raw,
                                                   guesser=guesser, repository=repository,
                                                   tracker_data=tracker_data,
                                                   changed_files_map=changed_files_map,
                                                   pushed_commits=pushed_commits,
                                                   )
                            log.info('Successfully finished creating commits in batch process')
                            # Flush the audit buffer once it grows past 10000
                            # entries; the helper returns what is still pending.
                            if len(pushed_commits) > 10000:
                                pushed_commits = flush_pushed_commits_to_yt(repository, pushed_commits)
                        except OperationalError as exc:
                            db.connections.close_all()
                            log.error('Got db error "%s', repr(exc))
                        except SoftTimeLimitExceeded:
                            # Handled by the outer handler, which re-queues the task.
                            raise
                        except Exception as exc:
                            log.error('Got unrecoverable error "%s"', repr(exc))
                            break
                        else:
                            break
                log.info('Create all needed commits')
                flush_pushed_commits_to_yt(repository, pushed_commits)

                repository.create_commits_needed = False
                repository.save()
                # NOTE(review): is_active is assigned after save(); whether
                # update_commits_statistics() persists it is not visible here.
                if has_new:
                    repository.is_active = True
                    repository.update_commits_statistics()

            except SoftTimeLimitExceeded:
                log.warning('Create commits timed out, will continue task in 15 minutes')
                queue_name = get_queue_name('clone', clone.repo.is_important)
                create_commits.apply_async(
                    args=[clone.id],
                    queue=get_node_queue(queue_name, clone.node),
                    countdown=60 * 15
                )
                return
            except Exception as exc:
                trace = traceback.format_exc()
                parsed_error = get_parsed_error(exc, trace)
                mark_object_as_failed(repository, trace, parsed_error)
                log.exception('Unrecoverable error "%s" while create commits', repr(exc))


def flush_pushed_commits_to_yt(repository, pushed_commits):
    """
    Append audit rows for ``pushed_commits`` to the YT audit table.

    Commits whose message contains ``git-svn-id`` are not sent. When the
    table does not exist it is created with an explicit schema first.

    :param repository: repository of the commits; ``repository.source.host``
        is written to the ``repo_vcs_type`` column.
    :param pushed_commits: pushed commit objects to report.
    :return: commits that still have to be flushed - ``pushed_commits`` itself
        when YT raised ``yt.YtError``, an empty list in every other case
        (including when the audit is disabled by settings).
    """
    audit_disabled = settings.IS_BUSINESS or not settings.YT_AUDIT_PATH
    if audit_disabled:
        return []

    rows = []
    for pushed in pushed_commits:
        if 'git-svn-id' in pushed.message:
            continue
        rows.append({
            'commit_id': pushed.commit,
            'commit_time': calendar.timegm(pushed.commit_time.utctimetuple()),
            'author': pushed.author.login,
            'committer': pushed.committer.login,
            'lines_added': pushed.lines_added,
            'lines_deleted': pushed.lines_deleted,
            'repo_vcs_name': pushed.repo.vcs_name,
            'repo_id': pushed.repo.id,
            'source_host': platform.node(),
            'repo_vcs_type': repository.source.host,
            'branch_name': pushed.branch_name,
        })

    if len(rows) == 0:
        return []

    try:
        yt_client = yt.YtClient(proxy=settings.YT_AUDIT_CLUSTER, token=settings.YT_AUDIT_TOKEN)
        table_path = settings.YT_AUDIT_PATH + settings.YT_AUDIT_TABLE
        table_exists = yt_client.exists(table_path)
        if not table_exists:
            log.info('Table {} does not exist, creating'.format(table_path))
            column_types = (
                ('commit_id', 'string'),
                ('commit_time', 'datetime'),
                ('author', 'string'),
                ('committer', 'string'),
                ('lines_added', 'uint32'),
                ('lines_deleted', 'uint32'),
                ('repo_vcs_name', 'string'),
                ('repo_id', 'int32'),
                ('source_host', 'string'),
                ('repo_vcs_type', 'string'),
                ('branch_name', 'string'),
            )
            table_attributes = {
                'schema': [
                    {'name': column, 'type': column_type}
                    for column, column_type in column_types
                ],
                'optimize_for': 'scan',
            }
            yt_client.create('table', table_path, attributes=table_attributes, recursive=True)

        log.info('Start flush to yt %s commits', len(rows))
        destination = yt_client.TablePath(table_path, append=True)
        yt_client.write_table(destination, rows)
    except yt.YtError:
        log.exception('Failed to send YT audit')
        # Hand the untouched buffer back so the caller can try again later.
        return pushed_commits
    else:
        return []


@clone_task
def move_clone_to_volume(clone):
    """
    Copy a clone to a volume node with rsync and register a new Clone there.

    The first node returned by ``select_nodes_for_repo`` is used as the
    target. rsync is retried every 5 seconds until it exits with code 0.
    After a successful copy a ``Clone`` row is created for the volume, and
    for svn sources a ``change_svn_config`` task is queued on that node.

    :param clone: the clone to copy; its ``repo``, ``path`` and ``status``
        are read.
    """
    repo = clone.repo
    with log_context(repository=str(repo), repo_id=repo.id, clone_id=clone.id, clone=str(clone)):
        log.info('Started moving process')
        acceptable_volumes = select_nodes_for_repo(repo)
        if len(acceptable_volumes) == 0:
            log.error('No acceptable volumes found, aborting')
            return
        volume = acceptable_volumes[0]
        with log_context(volume=str(volume.hostname)):
            volume_move_path = '/storage/dogma/repos/{}/{}'.format(repo.source.code,
                                                                   repo.owner,
                                                                   )
            volume_full_path = '{}/{}'.format(volume_move_path,
                                              repo.name,
                                              )
            # Pass an argument list instead of a shell string: repo owner and
            # path values are never interpreted by the local shell.
            # NOTE(review): the remote part of the destination is still parsed
            # on the remote side by rsync/ssh.
            command_to_run = [
                'rsync', '-avz',
                '-e', 'ssh -o StrictHostKeyChecking=no',
                str(clone.path),
                'root@{}:{}'.format(volume.hostname, volume_move_path),
            ]

            attempt = 0
            # NOTE(review): the retry loop has no upper bound on attempts.
            while True:
                attempt += 1
                moving_process = Popen(command_to_run, stdout=PIPE, stderr=PIPE)
                std_out, std_err = moving_process.communicate()
                response_code = moving_process.returncode
                if response_code == 0:
                    log.info('Successfully move data to volume, take "%s" attempts', attempt)
                    break

                # There is no active exception here, so log.error is used
                # (log.exception would attach an empty traceback).
                log.error(('Got exception, status code: "%s" while moving to volume, '
                           'retrying. Attempt "%s". STDOUT: "%s", STDERR: "%s"'),
                          response_code, attempt, repr(std_out[-300:]), repr(std_err[-300:]))
                time.sleep(5)

            Clone.objects.create(node=volume,
                                 repo=repo,
                                 status=clone.status,
                                 path=volume_full_path,
                                 )
            log.info('Successfully create clone on volume')
            if repo.source.vcs_type == 'svn':
                log.info('Sending task for config change')
                change_svn_config.apply_async(args=[repo.id],
                                              queue=get_node_queue('clone', volume),
                                              )


@app.task
def change_svn_config(repo_id):
    """
    Point the repository's local git config at the shared SVN authors file.

    Runs ``git config --local svn.authorsfile <DOGMA_SVN_USERS_PATH>`` through
    the repository backend. Does nothing when the per-repo lock is not
    obtained or when the repository source is not svn.

    :param repo_id: primary key of the repository.
    """
    lock_key = 'change_svn_config_{}'.format(repo_id)
    with locked_context(lock_key) as got_lock:
        if not got_lock:
            log.debug('Lock "%s" is aquired', lock_key)
            return

        repo = Repo.objects.select_related('source').get(id=repo_id)
        with log_context(repo_id=repo_id, repo=str(repo)):
            is_svn = repo.source.vcs_type == 'svn'
            if not is_svn:
                log.error('Got repo with wrong source type')
                return

            backend = get_backend(repo)
            log.info('Starting change config process')
            git_args = ('config', '--local', 'svn.authorsfile', settings.DOGMA_SVN_USERS_PATH)
            backend.run_git_command(*git_args)
            log.info('Successfully change clone config')


@app.task(time_limit=60 * 60 * 8, soft_time_limit=60 * 60 * 7)
def validate_users():
    """
    Merge non-staff users (without uid) into matching staff users.

    For every user without uid, look for a staff user with the same email;
    when one is found, bind the commits to the staff user and delete the
    duplicate.

    When no such user exists in the database but staff reports that the
    user exists, update the current user's data instead, so that commits
    do not have to be re-bound.

    Users whose email does not look like an address are skipped. Each user
    gets at most two attempts: a DB ``OperationalError`` triggers one retry,
    any other error is logged and the user is skipped.
    """
    email_regex = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)')
    users_to_check = User.objects.filter(uid__isnull=True, email__contains='@')
    for user in users_to_check:
        if not email_regex.match(user.email):
            continue
        for _ in range(2):
            try:
                with log_context(user_email=repr(user.email), task='validate_users'):
                    main_user = User.objects.filter((Q(email=user.email) |
                                                     Q(other_emails__contains=user.email)),
                                                    uid__isnull=False,
                                                    from_staff=True,
                                                    ).first()

                    if main_user:
                        log.info('Found main user')
                        # Re-binding and deletion must succeed or fail together.
                        with transaction.atomic():
                            bind_commits_to_user(main_user, user)
                            user.delete()
                        log.info('Successfully bind commits to main user and delete duplicate')
                    else:
                        user_data = get_user_data_by_email_from_staff(email=user.email)
                        if user_data:
                            log.info('Got user data from staff')
                            try:
                                create_or_update_user_from_data(user_data, user_for_update=user)
                            except IntegrityError as exc:
                                if 'duplicate key value violates unique constraint "core_user_uid"' in str(exc):
                                    # Another user already owns this uid:
                                    # attach the commits to that uid instead.
                                    log.info('Found user with same uid, updating commits')
                                    uid = user_data['uid']
                                    attach_commits_to_uid_by_emails(uid, user.email)
                                    log.info('Successfully update commits data')
                                else:
                                    # Previously dropped without a trace; keep
                                    # going, but leave evidence in the log.
                                    log.exception('Got unexpected integrity error while updating user data')
                            else:
                                log.info('Successfully update user data')
            except SoftTimeLimitExceeded:
                raise
            except OperationalError as exc:
                # Reset connections and use the second attempt.
                db.connections.close_all()
                log.error('Got db error "%s"', repr(exc))
                continue
            except Exception:
                log.exception('Got unhandled exception while validate user "%s"', user.email)
                break
            else:
                break


@app.task(time_limit=60 * 60 * 8, soft_time_limit=60 * 60 * 7)
def update_files_changed_statistics(date=None):
    """
    Update the per-user statistics of changed files.

    The update can be run for the commits of one specific day by passing
    the date as ``YYYY-MM-DD``.

    The task is launched automatically once a day. It takes all commits that
    are not aggregated yet for the current and the two previous days (in case
    the queue is busy and the task starts later than it should), walks through
    the changed files of these commits and updates the aggregated-data table.

    When a commit was made by a user without uid, the task tries to find the
    commit author with a uid. If that fails, the commit data is not
    aggregated, so the aggregated table only contains data of users obtained
    from the API of the respective sources (staff-api, connect data).

    :param date: optional ``YYYY-MM-DD`` string; when omitted the current
        time is used and commits from two days back are selected.
    """
    lock_name = 'update_files_changed_statistics_{}'.format(date)
    with locked_context(lock_name) as acquired:
        if not acquired:
            log.debug('Lock "%s" is aquired', lock_name)
            return

        # The work must stay inside the lock context: previously the `with`
        # block ended right after the check, so the lock protected nothing.
        # (Assumes locked_context releases the lock on exit, as the nested
        # layout used by create_commits suggests.)
        with log_context(task='update_files_changed_statistics'):
            try:
                log.info('Started update files statistics task')
                if date:
                    date = prepare_date(date)
                    commits_to_process = get_commits_for_file_statistics_aggregation(date, exact_only=True)
                else:
                    date = now()
                    commits_to_process = get_commits_for_file_statistics_aggregation(date - timedelta(days=2))

                commits_to_process = commits_to_process.select_related('author').prefetch_related('changed_files')
                commits_to_process = commits_to_process.order_by('author_id')
                # Statistics are stored against the first day of the month.
                date_to_insert = date.replace(day=1)
                # Authors for whom no main user could be found.
                users_to_skip = set()

                for batch in slice_commits_in_batches(iter(commits_to_process), slice_size=200):
                    if not batch:
                        continue

                    users_data_to_update = set()
                    log.info('Started update batch process')
                    data_to_insert = defaultdict(Counter)
                    for commit in batch:
                        with log_context(author=commit.author.email, commit=commit.commit):
                            if commit.author_id in users_to_skip:
                                continue

                            user = None
                            try:
                                user = guess_main_user(commit.author)
                            except User.DoesNotExist:
                                pass

                            if not user:
                                log.info('No main user found, skipping')
                                users_to_skip.add(commit.author_id)
                                continue

                            extensions_data = collect_extensions_data(commit)
                            if extensions_data:
                                users_data_to_update.add(user.id)
                                update_extensions_data(data_to_insert, extensions_data, user)

                    if data_to_insert:
                        log.info('Updating statistics for batch')
                        insert_file_statistics_data(data_to_insert,
                                                    date_to_insert,
                                                    users_data_to_update,
                                                    )
                        log.info('Successfully update statistics for batch')

                    # NOTE(review): every commit of the batch is marked as
                    # aggregated, including those skipped for lack of a main user.
                    PushedCommit.objects.filter(id__in=[
                        commit.id
                        for commit in batch
                    ]).update(aggregated=True)
                    log.info('Successfully update commits in batch')

                log.info('Finished update files statistics task')
            except SoftTimeLimitExceeded:
                raise
            except Exception as exc:
                log.exception('Unrecoverable error while updating batch statistics: "%s"', repr(exc))


@app.task()
def send_changed_files_task():
    """
    Queue ``create_changed_files_for_clone`` for repositories with an active clone.

    For each repository that has clones, the first clone with status
    ``active`` is picked and the task is sent to the queue of that clone's node.
    """
    log.info('Started sending tasks for changed files creation')
    # NOTE(review): filtering on clones__isnull=False may yield a repository
    # once per clone - confirm whether duplicates are expected here.
    repos_with_clones = Repo.objects.filter(clones__isnull=False)
    for repository in repos_with_clones:
        active_clone = repository.clones.filter(status='active').first()
        if not active_clone:
            continue
        base_queue = get_queue_name('clone', active_clone.repo.is_important)
        target_queue = get_node_queue(base_queue, active_clone.node)
        create_changed_files_for_clone.apply_async(args=[active_clone.id], queue=target_queue)
    log.info('Successfully finished sending tasks for changed files creation')


@app.task(time_limit=60 * 60 * 24, soft_time_limit=60 * 60 * 23)
def create_changed_files_for_clone(clone_id):
    """
    Create ``ChangedFile`` rows for commits of a clone that lack them.

    Commits whose ``create_changed_files`` flag is already set (True or
    False) are excluded. The rest are processed in batches of 10: changed
    files are extracted from the batch diff and bulk-created, and each commit
    is flagged ``create_changed_files=True`` (files found) or ``False``
    (no files, or the diff could not be parsed).

    On a soft time limit the task re-queues itself with a 5 minute countdown.
    Any other error in a batch is logged and the next batch is processed.

    :param clone_id: primary key of the clone to process.
    """
    clone = Clone.objects.select_related('repo').get(id=clone_id)
    lock_name = 'create_changed_files_{}'.format(clone.repo_id)
    with locked_context(lock_name) as acquired:
        if not acquired:
            log.info('Lock "%s" is already acquired', lock_name)
            return

        # The work must stay inside the lock context: previously the `with`
        # block ended right after the check, so the lock protected nothing.
        # (Assumes locked_context releases the lock on exit, as the nested
        # layout used by create_commits suggests.)
        with log_context(clone_id=clone_id, repo_id=clone.repo_id, task='create_changed_files'):
            repo_raw = get_repository_model(clone)

            already_updated = set(
                PushedCommit.objects.filter(
                    repo_id=clone.repo_id,
                    create_changed_files__isnull=False,
                ).values_list('commit', flat=True)
            )
            commits_to_update = repo_raw.all_commits(exclude=already_updated)
            for batch in slice_commits_in_batches(commits_to_update, slice_size=10):
                try:
                    if not batch:
                        continue

                    log.info('Started create changed files process for batch')
                    changed_files_to_create = []
                    commits_with_files = set()
                    commits_without_files = set()
                    batch_diff = get_batch_diff(batch=batch, repo_raw=repo_raw)
                    for commit in batch:
                        # NOTE(review): the lookup is by hash only, not scoped
                        # to this repository - confirm hashes are unique here.
                        pushed_commit = PushedCommit.objects.filter(commit=commit.hex,
                                                                    create_changed_files__isnull=True
                                                                    ).first()
                        if not pushed_commit:
                            log.warning('Found not created commit for "%s"', clone.repo_id)
                            continue

                        try:
                            diff = batch_diff[commit.hex]
                            changed_files = diff.changed_files
                            changed_files_objects = get_changed_files_objects(changed_files, pushed_commit)
                        except (UnidiffParseError, LookupError) as exc:
                            log.warning('Got error while getting changed files "%s"', repr(exc))
                            commits_without_files.add(pushed_commit.id)
                            continue

                        if changed_files_objects:
                            changed_files_to_create.extend(changed_files_objects)
                            commits_with_files.add(pushed_commit.id)
                        else:
                            commits_without_files.add(pushed_commit.id)

                    # Files and commit flags are written together so a commit
                    # is never flagged without its files being stored.
                    with transaction.atomic():
                        if changed_files_to_create:
                            log.info('Creating changed files')
                            ChangedFile.objects.bulk_create(changed_files_to_create)
                            log.info('Successfully create changed files')
                        PushedCommit.objects.filter(id__in=commits_with_files).update(create_changed_files=True)
                        PushedCommit.objects.filter(id__in=commits_without_files).update(create_changed_files=False)

                    log.info('Successfully finished create changed files process for batch')
                except SoftTimeLimitExceeded:
                    log.warning('Create changed files for clone timed out, will continue task in 5 minutes')
                    queue_name = get_queue_name('clone', clone.repo.is_important)
                    create_changed_files_for_clone.apply_async(
                        args=[clone.id],
                        queue=get_node_queue(queue_name, clone.node),
                        countdown=60 * 5
                    )
                    return
                except Exception as exc:
                    log.exception('Unrecoverable error while creating changed files: "%s"', repr(exc))


@app.task(time_limit=60 * 60 * 24, soft_time_limit=60 * 60 * 23)
def recheck_file_extension(from_repo_id, to_repo_id):
    """
    Recompute the stored extension of changed files for a range of repositories.

    For every repository with a primary key in ``[from_repo_id, to_repo_id]``,
    each changed file of commits flagged ``create_changed_files=True`` is
    re-evaluated with ``ChangedFilesMixin.get_file_extension`` and saved when
    the stored extension differs.

    :param from_repo_id: lower bound of the repository pk range.
    :param to_repo_id: upper bound of the repository pk range.
    """
    log.info('Started rechecking from "%s"', from_repo_id)
    extension_checker = ChangedFilesMixin()
    repos_in_range = Repo.objects.filter(pk__range=[from_repo_id, to_repo_id])
    for repo in repos_in_range:
        commits_with_files = PushedCommit.objects.filter(create_changed_files=True, repo=repo)
        for pushed_commit in commits_with_files:
            for changed_file in pushed_commit.changed_files.all():
                expected_extension = extension_checker.get_file_extension(changed_file.name)
                if changed_file.extension == expected_extension:
                    continue
                changed_file.extension = expected_extension
                changed_file.save()
        log.info('Finished checking for "%s"', repo.id)


@app.task()
def send_correct_commit_time_task():
    """
    Queue ``set_correct_date_for_clone`` for repositories with suspect commit times.

    A repository qualifies when it has at least one pushed commit whose
    ``commit_time`` equals 2016-09-28 06:46:37.903971 UTC and it has a clone
    with status ``active``; the task goes to the queue of that clone's node.
    """
    log.info('Started sending tasks for setting correct commit time')
    # NOTE(review): presumably a placeholder timestamp written to commits by
    # mistake - confirm its origin.
    suspect_commit_time = datetime.datetime(2016, 9, 28, 6, 46, 37, 903971, tzinfo=utc)
    for repo in Repo.objects.filter(clones__isnull=False):
        suspect_commits = PushedCommit.objects.filter(repo=repo, commit_time=suspect_commit_time)
        if not suspect_commits.exists():
            continue
        active_clone = repo.clones.filter(status='active').first()
        if not active_clone:
            continue
        target_queue = get_node_queue('clone', active_clone.node)
        set_correct_date_for_clone.apply_async(args=[active_clone.id], queue=target_queue)
    log.info('Successfully finished sending tasks for setting correct commit time')


@app.task(time_limit=60 * 60 * 24, soft_time_limit=60 * 60 * 23)
def set_correct_date_for_clone(clone_id):
    """
    Restore the real commit time of pushed commits from the raw repository.

    Commits of the repository whose stored ``commit_time`` differs from
    2016-09-28 06:46:37.903971 UTC are treated as already correct and are
    excluded. For every remaining raw commit the matching ``PushedCommit``
    gets the commit time taken from the repository.

    On a soft time limit the task re-queues itself with a 5 minute countdown.
    Any other per-commit error is logged and processing continues.

    :param clone_id: primary key of the clone to process.
    """
    clone = Clone.objects.select_related('repo').get(id=clone_id)
    lock_name = 'set_correct_date_for_commits_{}'.format(clone.repo_id)
    with locked_context(lock_name) as acquired:
        if not acquired:
            log.info('Lock "%s" is already acquired', lock_name)
            return

        # The work must stay inside the lock context: previously the `with`
        # block ended right after the check, so the lock protected nothing.
        # (Assumes locked_context releases the lock on exit, as the nested
        # layout used by create_commits suggests.)
        with log_context(clone_id=clone_id, repo_id=clone.repo_id, task='set_correct_date_for_commits'):
            log.info('Started setting correct date for clone')
            repo_raw = get_repository_model(clone)

            already_updated = set(
                PushedCommit.objects.filter(repo_id=clone.repo_id).exclude(
                    commit_time=datetime.datetime(
                        2016, 9, 28, 6, 46, 37, 903971,
                        tzinfo=utc)
                ).values_list('commit', flat=True)
            )
            commits_to_update = repo_raw.all_commits(exclude=already_updated)
            for commit in commits_to_update:
                try:
                    real_date = commit.commit_time
                    # NOTE(review): the lookup is by hash only, not scoped to
                    # this repository - confirm hashes are unique here.
                    pushed_commit = PushedCommit.objects.filter(commit=commit.hex).first()
                    if pushed_commit and pushed_commit.commit_time != real_date:
                        pushed_commit.commit_time = real_date
                        pushed_commit.save()
                except SoftTimeLimitExceeded:
                    log.warning('Set correct date for clone timed out, will continue task in 5 minutes')
                    set_correct_date_for_clone.apply_async(
                        args=[clone.id],
                        queue=get_node_queue('clone', clone.node),
                        countdown=60 * 5
                    )
                    return
                except Exception as exc:
                    log.exception('Unrecoverable error while setting correct date: "%s"', repr(exc))

            log.info('Finished setting correct date for clone')


@app.task
@locked('flush_lost_commits_to_yt')
def flush_lost_commits_to_yt(days_interval=2):
    """
    Send commits that exist in the database but not in the YT audit table.

    The task first loads every ``(commit_id, repo_id)`` pair from the YT
    table, then walks repositories whose latest commit is newer than
    ``days_interval`` days and appends the missing commits to the table in
    chunks. Commits whose message contains ``git-svn-id`` are never sent.

    :param days_interval: only repositories whose newest ``commit_time`` is
        within this many days (UTC+3 clock) are examined.
    """
    log.info('Start flush lost commits to yt')

    # Number of YT rows consumed so far; later reads resume from this index
    # so only rows appended in the meantime are fetched.
    yt_table_index = 0
    yt_table_path = settings.YT_AUDIT_PATH + settings.YT_AUDIT_TABLE
    # Maps str(repo_id) -> set of commit ids already present in YT.
    commit_ids_by_repo = dict()
    yt_client = yt.YtClient(proxy=settings.YT_AUDIT_CLUSTER, token=settings.YT_AUDIT_TOKEN)

    log.debug('Start load previous data from yt')

    for row in yt_client.read_table(yt_client.TablePath(yt_table_path, columns=['commit_id', 'repo_id'], start_index=yt_table_index)):
        yt_repo_id = row['repo_id']
        if commit_ids_by_repo.get(str(yt_repo_id)) is None:
            commit_ids_by_repo[str(yt_repo_id)] = set()
        commit_ids_by_repo.get(str(yt_repo_id)).add(row['commit_id'])
        yt_table_index = yt_table_index + 1

    log.debug('Finish load previous data from yt')

    # Repositories whose newest pushed commit is more recent than
    # `days_interval` days ago (FixedOffset(180) is UTC+03:00).
    repos_for_process = [repo.id for repo in Repo.objects.raw(
            'SELECT id '
            'FROM core_repo '
            'WHERE ( '
            '   SELECT max(commit_time) > %s '
            '   FROM core_pushedcommit WHERE core_pushedcommit.repo_id = core_repo.id '
            ') '
            'ORDER BY id ', (
                datetime.datetime.now(pytz.FixedOffset(180)) - timedelta(days=days_interval),
            )
        )]

    log.debug('Repos for process count: %s', len(repos_for_process))

    skipped_repos = []
    # Buffer of rows waiting to be written to YT.
    result_commits = []
    # Only commits made on or after this moment are considered.
    commits_start_dt = datetime.datetime(2020, 3, 2, 0, 0, 0, 0, pytz.FixedOffset(180))
    for repo_id in repos_for_process:
        if PushedCommit.objects.filter(repo__id=repo_id, commit_time__gte=commits_start_dt).exists():
            commit_ids = commit_ids_by_repo.get(str(repo_id), [])

            db_commits = PushedCommit.objects.select_related('author', 'committer', 'repo', 'repo__source').filter(repo__id=repo_id, commit_time__gte=commits_start_dt)
            # Candidate rows: commits not yet known to be in YT.
            iteration_commits = [
                {
                    'commit_id': commit.commit,
                    'commit_time': calendar.timegm(commit.commit_time.utctimetuple()),
                    'author': commit.author.login,
                    'committer': commit.committer.login,
                    'lines_added': commit.lines_added,
                    'lines_deleted': commit.lines_deleted,
                    'repo_vcs_name': commit.repo.vcs_name,
                    'repo_id': commit.repo.id,
                    'source_host': platform.node(),
                    'repo_vcs_type': commit.repo.source.host,
                    'branch_name': commit.branch_name,
                }
                for commit in db_commits
                if 'git-svn-id' not in commit.message and not (commit.commit in commit_ids)
            ]

            if not iteration_commits:
                # Nothing is missing: just record the sync time. A DB
                # OperationalError while saving is deliberately ignored.
                try:
                    repo = Repo.objects.get(pk=repo_id)
                    repo.last_yt_sync_time = now()
                    repo.save()
                except OperationalError:
                    pass
                continue
            # NOTE(review): presumably the same key create_commits builds via
            # get_lock_key, so both tasks never write one repo at once - confirm.
            lock_key = 'clone_action(' + str(repo_id) + ')_create_commits'
            with locked_context(lock_key) as acquired:
                if acquired:
                    # Re-read rows appended to YT since the last read, then
                    # filter the candidates again against the refreshed set.
                    for row in yt_client.read_table(yt_client.TablePath(
                        yt_table_path,
                        columns=['commit_id', 'repo_id'],
                        start_index=yt_table_index
                    )):
                        yt_table_index = yt_table_index + 1
                        yt_repo_id = row['repo_id']
                        if commit_ids_by_repo.get(str(yt_repo_id)) is None:
                            commit_ids_by_repo[str(yt_repo_id)] = set()
                        commit_ids_by_repo.get(str(yt_repo_id)).add(row['commit_id'])

                    commit_ids = commit_ids_by_repo.get(str(repo_id), [])
                    iteration_commits = [e for e in iteration_commits if e['commit_id'] not in commit_ids]
                    if iteration_commits:
                        log.info('Found %s lost commits for repository %s', len(iteration_commits), repo_id)
                        result_commits += iteration_commits
                        # Write the buffer out once it exceeds 10000 rows.
                        if len(result_commits) > 10000:
                            log.info('Start flush to yt %s lost commits', len(result_commits))
                            yt_client.write_table(yt_client.TablePath(yt_table_path, append=True), result_commits)
                            result_commits = []

                    # NOTE(review): the sync time is recorded even when this
                    # repo's rows are still only buffered and not yet written.
                    try:
                        repo = Repo.objects.get(pk=repo_id)
                        repo.last_yt_sync_time = now()
                        repo.save()
                    except OperationalError:
                        pass

                else:
                    log.warning('Lock `%s` is acquired by another process', lock_key)
                    skipped_repos.append(repo_id)

    # Write whatever is left in the buffer.
    if result_commits:
        log.info('Start flush to yt %s lost commits', len(result_commits))
        yt_client.write_table(yt_client.TablePath(yt_table_path, append=True), result_commits)
    if skipped_repos:
        log.warning('Can\'t flush commits for repos: %s', skipped_repos)


def can_create_snapshot(d=None):
    """Tell whether the commit snapshot for the day containing ``d`` may be built.

    The snapshot is refused (``False``) when either

    * some clone of an active repo was last modified at or before the start
      of the day (UTC+3), or
    * some active repo with a known end of its contiguous commit chain was
      never synced to YT, or has a last commit newer than its
      ``last_yt_sync_time`` but older than the start of the day.

    Repos created after the start of the day are ignored, and repos whose last
    commit time cannot be determined are only logged.

    :param d: naive datetime in UTC+3; defaults to the current moment.
    :return: ``True`` when nothing blocks the snapshot, ``False`` otherwise.
    """
    if d is None:
        d = datetime.datetime.utcnow() + datetime.timedelta(hours=3)

    current_day_start = datetime.datetime(d.year, d.month, d.day, 0, 0, 0, 0, pytz.FixedOffset(180))

    clones = Clone.objects.filter(modified__lte=current_day_start, repo__is_active=True)
    not_ready_count = len(clones)
    if not_ready_count > 0:
        log.info(
            "Can not create snapshot: %d clones are not ready (repos %s)",
            not_ready_count,
            # repo_id is stored on the clone row itself, so logging the sample
            # does not fetch a Repo per clone; only the first ten are formatted.
            ", ".join(str(clone.repo_id) for clone in clones[:10]),
        )
        return False

    sql = """
        SELECT core_repo.id, core_repo.created, core_repo.last_yt_sync_time, last_commit.created, last_commit.modified, last_commit.commit_time
        FROM core_repo
        LEFT JOIN core_pushedcommit AS last_commit
        ON last_commit.id = core_repo.contiguous_chain_of_commits_ends_at_id
        WHERE
            core_repo.is_active = true AND
            core_repo.contiguous_chain_of_commits_ends_at_id IS NOT NULL;
    """

    with connection.cursor() as c:
        c.execute(sql)
        res = c.fetchall()

    unflushed_repos = []

    for repo_id, repo_created, last_yt_sync_time, created, modified, commit_time in res:

        last_commit_time = created or modified or commit_time  # XXX DOGMA-766

        if repo_created > current_day_start:
            continue

        if last_commit_time is None:
            log.warning("Can not determine last commit time for repo %d", repo_id)
            continue

        if last_yt_sync_time is None:
            unflushed_repos.append(repo_id)
            continue

        # The last commit landed after the latest sync but before the day
        # being snapshotted started, so the snapshot would miss it.
        if last_yt_sync_time < last_commit_time < current_day_start:
            unflushed_repos.append(repo_id)

    if unflushed_repos:
        log.info(
            "Can not create snapshot: %s repos are not flushed to yt (%s)",
            len(unflushed_repos),
            ", ".join(str(repo_id) for repo_id in unflushed_repos[:10]),
        )
        return False

    return True


@app.task(time_limit=60 * 60)
@locked('create_snapshot')
def create_snapshot(d=None, prefix="auto_", create_link=True):
    """Build the per-day commits snapshot table in YT.

    Does nothing unless ``can_create_snapshot(d)`` allows it, or when the
    snapshot table for the day already exists. Otherwise the table
    ``<YT_AUDIT_PATH><prefix>commits_<yy-mm-dd>`` is created for the day
    preceding ``d`` and filled with the rows of the audit table whose
    ``commit_time`` lies within the six months ending at the start of day ``d``
    (UTC+3), minus a fixed list of excluded repositories.

    :param d: naive datetime in UTC+3; defaults to the current moment.
    :param prefix: prefix of the snapshot table name.
    :param create_link: when true, ``<YT_AUDIT_PATH>commits`` is re-linked to
        the new table.
    """
    if d is None:
        d = datetime.datetime.utcnow() + datetime.timedelta(hours=3)

    if not can_create_snapshot(d):
        return

    log.warning("Can create snapshot")

    yt_client = yt.YtClient(proxy=settings.YT_AUDIT_CLUSTER, token=settings.YT_AUDIT_TOKEN)
    yt_table_path = settings.YT_AUDIT_PATH + settings.YT_AUDIT_TABLE

    one_day = datetime.timedelta(days=1)
    day_start = datetime.datetime(d.year, d.month, d.day, 0, 0, 0, 0, pytz.FixedOffset(180)) - one_day
    window_end = day_start + one_day
    next_day_start_sec = calendar.timegm(window_end.utctimetuple())
    half_year_ago_start_sec = calendar.timegm((window_end + relativedelta(months=-6)).utctimetuple())

    yt_new_table_path = settings.YT_AUDIT_PATH + prefix + 'commits_' + day_start.strftime('%y-%m-%d')

    if yt_client.exists(yt_new_table_path):
        log.info("Snapshot already exists")
        return

    log.info('Table {} does not exist, creating'.format(yt_new_table_path))
    snapshot_schema = [
        {'name': 'commit_id', 'type': 'string'},
        {'name': 'commit_time', 'type': 'datetime'},
        {'name': 'author', 'type': 'string'},
        {'name': 'committer', 'type': 'string'},
        {'name': 'lines_added', 'type': 'uint32'},
        {'name': 'lines_deleted', 'type': 'uint32'},
        {'name': 'repo_vcs_name', 'type': 'string'},
        {'name': 'repo_id', 'type': 'int32'},
        {'name': 'source_host', 'type': 'string'},
        {'name': 'repo_vcs_type', 'type': 'string'},
        {'name': 'branch_name', 'type': 'string'},
    ]
    yt_client.create(
        'table',
        yt_new_table_path,
        attributes={'schema': snapshot_schema, 'optimize_for': 'scan'},
        recursive=True
    )

    # Repositories left out of the snapshot entirely (mobile/monorepo bb and others).
    excluded_repo_ids = (143554, 147593, 95983, 35641, 168378, 116542)

    def belongs_to_snapshot(row):
        """Apply the time window and the repository exclusions to one audit row."""
        commit_time_sec = int(row['commit_time'])
        if commit_time_sec >= next_day_start_sec:  # later than the snapshot day
            return False
        if commit_time_sec < half_year_ago_start_sec:
            return False
        # not browser bb
        # NOTE(review): the constant looks like a host name although the column
        # is repo_vcs_type - confirm this comparison matches the stored values.
        if row['repo_vcs_type'] == 'bitbucket.browser.yandex-team.ru':
            return False
        if row['repo_id'] in excluded_repo_ids:
            return False
        # Repo 50016 is kept except for this single branch.
        if row['repo_id'] == 50016 and row['branch_name'] == 'releases/experimental/mobile/master':
            return False
        return True

    source_commits = yt_client.read_table(yt_client.TablePath(yt_table_path))
    target_commits = [row for row in source_commits if belongs_to_snapshot(row)]
    yt_client.write_table(yt_client.TablePath(yt_new_table_path, append=True), target_commits)

    if create_link:
        yt_client.link(target_path=yt_new_table_path, link_path=settings.YT_AUDIT_PATH + 'commits', force=True)


def get_manifests(source_id):
    """Collect every change of a ``package.json`` file in the repos of a source.

    Only repos without a parent (``core_repo.parent_id IS NULL``) are
    considered, and only changed files with extension ``json`` whose name ends
    with ``/package.json``.

    :param source_id: primary key of the source (``core_source.id``).
    :return: list of dicts with keys ``vcs_name``, ``commit``, ``commit_time``
        (seconds since the epoch, UTC) and ``filename``.
    """
    # The source id is bound as a query parameter instead of being formatted
    # into the statement. Because parameters are passed, a literal percent sign
    # has to be doubled (`%%`) so the driver does not read it as a placeholder.
    sql = """
        SELECT
            core_repo.vcs_name,
            core_pushedcommit.commit,
            core_pushedcommit.commit_time,
            changedfile.name
        FROM
            core_repo
        JOIN
            core_source ON core_repo.source_id = core_source.id
        JOIN
            core_pushedcommit ON core_pushedcommit.repo_id = core_repo.id
        JOIN (
                SELECT
                    *
                FROM
                    core_changedfile
                WHERE
                    extension = 'json'
                AND
                    name LIKE '%%/package.json'
            )
            changedfile ON changedfile.commit_id = core_pushedcommit.id
        WHERE
            core_source.id = %s
        AND
            core_repo.parent_id IS NULL
    """

    with connection.cursor() as c:
        c.execute(sql, [source_id])
        rows = c.fetchall()

    return [
        {
            "vcs_name": vcs_name,
            "commit": commit,
            "commit_time": calendar.timegm(commit_time.utctimetuple()),
            "filename": filename,
        }
        for vcs_name, commit, commit_time, filename in rows
    ]


@app.task(time_limit=60 * 60)
@locked('dump_manifests')
def dump_manifests(source_id, table_data=None, create_link=True):
    """Dump the ``package.json`` change list of a source into a dated YT table.

    The table is named ``//home/dogma/manifests/files_<source_id>_<yy-mm-dd>``
    after the previous day (UTC+3). It is created and filled only when it does
    not exist yet; an existing table is left untouched.

    :param source_id: primary key of the source to dump.
    :param table_data: rows to write; fetched with ``get_manifests`` when None.
    :param create_link: when true, ``files_<source code>`` is re-linked to the
        dated table.
    """
    yt_client = yt.YtClient(proxy=settings.YT_AUDIT_CLUSTER, token=settings.YT_AUDIT_TOKEN)

    yt_table_path = "//home/dogma/manifests/"

    local_now = datetime.datetime.utcnow() + datetime.timedelta(hours=3)
    today_start = datetime.datetime(
        local_now.year, local_now.month, local_now.day, 0, 0, 0, 0, pytz.FixedOffset(180)
    )
    yesterday_start = today_start - datetime.timedelta(days=1)

    yt_table_name = "{}files_{}_{}".format(
        yt_table_path,
        str(source_id),
        yesterday_start.strftime('%y-%m-%d'),
    )

    table_is_missing = not yt_client.exists(yt_table_name)
    if table_is_missing:
        # The database is queried only when the caller supplied no rows.
        if table_data is None:
            table_data = get_manifests(source_id)

        manifest_schema = [
            {'name': 'commit', 'type': 'string'},
            {'name': 'commit_time', 'type': 'datetime'},
            {'name': 'vcs_name', 'type': 'string'},
            {'name': 'filename', 'type': 'string'},
        ]
        yt_client.create(
            'table',
            yt_table_name,
            attributes={'schema': manifest_schema, 'optimize_for': 'scan'},
            recursive=True
        )
        yt_client.write_table(yt_client.TablePath(yt_table_name, append=True), table_data)

    if not create_link:
        return

    source = Source.objects.get(pk=source_id)
    yt_client.link(target_path=yt_table_name, link_path=yt_table_path + 'files_' + source.code, force=True)



@app.task(time_limit=60 * 60)
@locked('dump_commits_to_yt')
def dump_commits_to_yt(d=None, days=1):
    """Export pushed commits to the dynamic YT tables under //home/dogma/export.

    For every ``k`` in ``range(days)`` the day preceding ``d - k days`` (UTC+3)
    is exported: commits whose ``created`` falls into that day are inserted
    into ``all_commits`` and, for the 30 most recent days only, into a per-day
    table ``by_date/<YYYY-MM-DD>`` that is given an expiration time seven days
    ahead. A day whose per-day table already exists is skipped.

    Message and paths of commits to non-public repos are replaced by a
    placeholder; the login is exported only for users marked ``from_staff``.

    :param d: naive datetime in UTC+3; defaults to the current moment.
    :param days: number of consecutive days to export, going back from ``d``.
    """
    logger = logging.getLogger(
        '.'.join((__name__, 'dump_commits_to_yt')),
    )

    if d is None:
        d = datetime.datetime.utcnow() + datetime.timedelta(hours=3)

    yt_dir_name = "//home/dogma/export"
    all_commits_yt_table_name = os.path.join(yt_dir_name, "all_commits")
    yt_client = yt.YtClient(proxy=settings.YT_AUDIT_CLUSTER, token=settings.YT_AUDIT_TOKEN)

    schema = [
        {"name": "timestamp", "type": "timestamp", "sort_order": "ascending"},
        {"name": "id", "type": "string", "sort_order": "ascending"},
        {"name": "repo", "type": "string", "sort_order": "ascending"},
        {"name": "login", "type": "string"},
        {"name": "email", "type": "string"},
        {"name": "message", "type": "string"},
        {"name": "paths", "type": "string"},
        {"name": "branch", "type": "string"},
        {"name": "mainLanguage", "type": "string"},
        {"name": "link", "type": "string"},
        {"name": "linesAdded", "type": "uint64"},
        {"name": "linesDeleted", "type": "uint64"},
        {"name": "cost", "type": "uint64"},
    ]

    attributes = {
        "dynamic": True,
        "optimize_for": "scan",
        "schema": schema,
    }

    # File extension -> language name used for the "mainLanguage" column.
    exts = {
        ".py": "python",
        ".cpp": "c++",
        ".h": "c++",
        ".java": "java",
        ".go": "go",
        ".js": "js",
    }

    def guess_main_language(files):
        """Return the language most of the ';'-separated files belong to.

        Returns None when no file has a known extension (or ``files`` is empty).
        """
        if not files:
            return None

        counts = {}
        for f in files.split(";"):
            lang = exts.get(os.path.splitext(f)[1])
            if lang is not None:
                counts[lang] = counts.get(lang, 0) + 1

        if not counts:
            return None

        # (count, language) tuples are compared, so on a tie the
        # alphabetically last language wins.
        return max((count, lang) for lang, count in counts.items())[1]

    if not yt_client.exists(all_commits_yt_table_name):
        yt_client.create("table", all_commits_yt_table_name, attributes=attributes)

    yt_client.mount_table(all_commits_yt_table_name)

    # Rows are sent in batches of this size.
    # NOTE(review): presumably chosen to stay under a per-request row limit of
    # insert_rows - confirm before changing.
    chunk_size = 90000

    for k in range(days):

        current_day_start = datetime.datetime(d.year, d.month, d.day, 0, 0, 0, 0, pytz.FixedOffset(180))
        current_day_start = current_day_start - datetime.timedelta(days=k)
        prev_day_start = current_day_start - datetime.timedelta(days=1)

        date_commits_yt_table_name = os.path.join(yt_dir_name, "by_date", prev_day_start.strftime("%Y-%m-%d"))

        if yt_client.exists(date_commits_yt_table_name):
            continue

        # Only strftime() output is interpolated into the statement.
        sql = """

            SELECT
                core_pushedcommit.commit_id,
                core_pushedcommit.commit_time,
                core_pushedcommit.message,
                core_pushedcommit.branch_name,
                core_pushedcommit.lines_added,
                core_pushedcommit.lines_deleted,
                core_repo.id,
                core_repo.is_public,
                core_repo.vcs_name,
                core_source.code,
                author.login,
                author.email,
                author.from_staff,
                STRING_AGG(core_changedfile.name, ';')

            FROM core_pushedcommit

            JOIN core_repo ON core_pushedcommit.repo_id = core_repo.id
            JOIN core_source ON core_repo.source_id = core_source.id
            JOIN core_user AS author ON core_pushedcommit.author_id = author.id
            --JOIN core_user AS committer ON core_pushedcommit.committer_id = committer.id
            JOIN core_changedfile ON core_pushedcommit.id = core_changedfile.commit_id
            WHERE
                core_pushedcommit.created BETWEEN '%s 00:00:00'::timestamp AND '%s 00:00:00'::timestamp
            GROUP BY core_pushedcommit.id, core_repo.id, core_source.id, author.id
        """ % (
            prev_day_start.strftime("%Y-%m-%d"),
            current_day_start.strftime("%Y-%m-%d"),
        )

        logger.debug(sql)

        with connection.cursor() as c:
            c.execute(sql)
            data = c.fetchall()

        repos = {}  # repo id -> Repo, so each repo is loaded once per day
        table_update = []

        for line in data:
            (
                commit_id,
                commit_time,
                message,
                branch_name,
                lines_added,
                lines_deleted,
                repo_id,
                is_public,
                vcs_name,
                vcs_type,
                login,
                email,
                from_staff,
                files
            ) = line

            if repo_id in repos:
                repo = repos[repo_id]
            else:
                repo = Repo.objects.get(pk=repo_id)
                repos[repo_id] = repo

            crawler = get_crawler(repo.source)

            record = {
                "timestamp": calendar.timegm(commit_time.utctimetuple()),
                "login": login if from_staff else None,
                "email": email,
                "id": commit_id,
                "message": message if is_public else "<commit to private repo>",
                "paths": files if is_public else "<commit to private repo>",
                "repo": vcs_type + ":" + vcs_name,
                "branch": branch_name,
                "linesAdded": lines_added,
                "linesDeleted": lines_deleted,
                "mainLanguage": guess_main_language(files),
                "link": crawler.get_commit_url(repo, commit_id),
                "cost": 0,
            }

            table_update.append(record)

        # Per-day tables are kept only for the 30 most recent days.
        write_date_table = k < 30

        if write_date_table and not yt_client.exists(date_commits_yt_table_name):
            yt_client.create("table", date_commits_yt_table_name, attributes=attributes)
            yt_client.set(
                date_commits_yt_table_name + '/@expiration_time',
                (datetime.datetime.now() + datetime.timedelta(days=7)).isoformat()
            )
            yt_client.mount_table(date_commits_yt_table_name)

        logger.debug("Update size: " + str(len(table_update)))

        # Stepping by chunk_size never yields an empty batch, including when
        # there are no rows or the row count is an exact multiple of chunk_size.
        for offset in range(0, len(table_update), chunk_size):

            chunk = table_update[offset:offset + chunk_size]

            if write_date_table:
                yt_client.insert_rows(date_commits_yt_table_name, chunk)

            yt_client.insert_rows(all_commits_yt_table_name, chunk)


@indexer_task
def refetch_failed_clones():
    """Drop the failed clones of the current node and schedule fresh clones.

    For every clone in ``fail`` status on this node the clone record is
    deleted, its directory (if any) is removed from disk, and ``clone_repo``
    is queued for the same repo on this node's "clone" queue.
    """
    # Resolved once: the same node is used for the filter and for every queue.
    current_node = get_current_node()

    failed_clones = Clone.objects.filter(
        status=Clone.STATUSES.fail,
        node=current_node
    )

    for failed_clone in failed_clones:
        # Only the repo id is needed, so the foreign key value is read
        # directly instead of loading the related Repo.
        repo_id = failed_clone.repo_id
        path = failed_clone.path
        failed_clone.delete()
        if path and os.path.exists(path):
            shutil.rmtree(path)
        clone_repo.apply_async(
            args=(repo_id,),
            queue=get_node_queue("clone", current_node)
        )


@app.task
def fetch_repos_without_clones():
    """Queue ``clone_repo`` for every active repo that has no active clone.

    The clone is scheduled on the "clone" queue of the first node returned by
    ``select_nodes_for_repo``; when no node is returned an error is logged.
    """
    active_clone_rows = Clone.objects.filter(status=Clone.STATUSES.active).values("repo_id").distinct()
    cloned_repo_ids = {row["repo_id"] for row in active_clone_rows}

    repos_to_clone = [
        repo
        for repo in Repo.objects.filter(is_active=True)
        if repo.id not in cloned_repo_ids
    ]

    for repo in repos_to_clone:
        nodes = select_nodes_for_repo(repo)
        if len(nodes) == 0:
            log.error('No acceptable node found for repo "%s"', repo.id)
            continue

        target_queue = get_node_queue("clone", nodes[0])
        clone_repo.apply_async(args=(repo.id,), queue=target_queue)


@app.task(time_limit=60 * 60)
@locked('dump_commits_to_activity')
def dump_commits_to_activity(d=None, days=1):
    """Export commits to the per-day tables of the devtools activity feed in YT.

    For every ``k`` in ``range(days)`` the day preceding ``d - k days`` (UTC+3)
    is exported: commits whose ``commit_time`` falls into that day are written
    to ``<dir>/<YYYY-MM-DD>_temp``, which is then moved over
    ``<dir>/<YYYY-MM-DD>`` with ``force=True``. A leftover temp table is
    removed first.

    Message and paths of commits to non-public repos are replaced by a
    placeholder; the login is exported only for users marked ``from_staff``.

    :param d: naive datetime in UTC+3; defaults to the current moment plus one
        day, so that with ``days=1`` the current day is exported.
    :param days: number of consecutive days to export, going back from ``d``.
    """
    logger = logging.getLogger(
        '.'.join((__name__, 'dump_commits_to_activity')),
    )

    if d is None:
        d = datetime.datetime.utcnow() + datetime.timedelta(hours=3, days=1)

    yt_dir_name = "//home/devtools-activity/sources/v1/commits"
    yt_client = yt.YtClient(proxy=settings.YT_AUDIT_CLUSTER, token=settings.YT_AUDIT_TOKEN)

    schema = [
        {"name": "timestamp", "type": "timestamp"},
        {"name": "id", "type": "string"},
        {"name": "repo", "type": "string"},
        {"name": "login", "type": "string"},
        {"name": "email", "type": "string"},
        {"name": "message", "type": "string"},
        {"name": "paths", "type": "string"},
        {"name": "branch", "type": "string"},
        {"name": "mainLanguage", "type": "string"},
        {"name": "link", "type": "string"},
        {"name": "linesAdded", "type": "uint64"},
        {"name": "linesDeleted", "type": "uint64"},
        {"name": "cost", "type": "uint64"},
    ]

    attributes = {
        "optimize_for": "scan",
        "schema": schema,
    }

    # File extension -> language name used for the "mainLanguage" column.
    exts = {
        ".py": "python",
        ".cpp": "c++",
        ".h": "c++",
        ".java": "java",
        ".go": "go",
        ".js": "js",
    }

    def guess_main_language(files):
        """Return the language most of the ';'-separated files belong to.

        Returns None when no file has a known extension (or ``files`` is empty).
        """
        if not files:
            return None

        counts = {}
        for f in files.split(";"):
            lang = exts.get(os.path.splitext(f)[1])
            if lang is not None:
                counts[lang] = counts.get(lang, 0) + 1

        if not counts:
            return None

        # (count, language) tuples are compared, so on a tie the
        # alphabetically last language wins.
        return max((count, lang) for lang, count in counts.items())[1]

    for k in range(days):

        current_day_start = datetime.datetime(d.year, d.month, d.day, 0, 0, 0, 0, pytz.FixedOffset(180))
        current_day_start = current_day_start - datetime.timedelta(days=k)
        prev_day_start = current_day_start - datetime.timedelta(days=1)

        date_commits_yt_table_name = os.path.join(yt_dir_name, prev_day_start.strftime("%Y-%m-%d"))
        date_commits_yt_table_name_temp = date_commits_yt_table_name + "_temp"

        # A temp table can only be a leftover of an earlier, unfinished run.
        if yt_client.exists(date_commits_yt_table_name_temp):
            logger.warning("Table %s exists, will be removed" % date_commits_yt_table_name_temp)
            yt_client.remove(date_commits_yt_table_name_temp)

        # Only strftime() output is interpolated into the statement.
        sql = """
            SELECT
                core_pushedcommit.commit_id,
                core_pushedcommit.commit_time,
                core_pushedcommit.message,
                core_pushedcommit.branch_name,
                core_pushedcommit.lines_added,
                core_pushedcommit.lines_deleted,
                core_repo.id,
                core_repo.is_public,
                core_repo.vcs_name,
                core_source.code,
                author.login,
                author.email,
                author.from_staff,
                STRING_AGG(core_changedfile.name, ';')
            FROM core_pushedcommit
            JOIN core_repo ON core_pushedcommit.repo_id = core_repo.id
            JOIN core_source ON core_repo.source_id = core_source.id
            JOIN core_user AS author ON core_pushedcommit.author_id = author.id
            --JOIN core_user AS committer ON core_pushedcommit.committer_id = committer.id
            JOIN core_changedfile ON core_pushedcommit.id = core_changedfile.commit_id
            WHERE
                core_pushedcommit.commit_time BETWEEN '%s 00:00:00'::timestamp AND '%s 00:00:00'::timestamp
            GROUP BY core_pushedcommit.id, core_repo.id, core_source.id, author.id
        """ % (
            prev_day_start.strftime("%Y-%m-%d"),
            current_day_start.strftime("%Y-%m-%d"),
        )

        logger.debug(sql)

        with connection.cursor() as c:
            c.execute(sql)
            data = c.fetchall()

        repos = {}  # repo id -> Repo, so each repo is loaded once per day
        table_update = []

        for line in data:
            (
                commit_id,
                commit_time,
                message,
                branch_name,
                lines_added,
                lines_deleted,
                repo_id,
                is_public,
                vcs_name,
                vcs_type,
                login,
                email,
                from_staff,
                files
            ) = line

            if repo_id in repos:
                repo = repos[repo_id]
            else:
                repo = Repo.objects.get(pk=repo_id)
                repos[repo_id] = repo

            crawler = get_crawler(repo.source)

            record = {
                "timestamp": calendar.timegm(commit_time.utctimetuple()),
                "login": login if from_staff else None,
                "email": email,
                "id": commit_id,
                "message": message if is_public else "<commit to private repo>",
                "paths": files if is_public else "<commit to private repo>",
                "repo": vcs_type + ":" + vcs_name,
                "branch": branch_name,
                "linesAdded": lines_added,
                "linesDeleted": lines_deleted,
                "mainLanguage": guess_main_language(files),
                "link": crawler.get_commit_url(repo, commit_id),
                "cost": 0,
            }

            table_update.append(record)

        if not yt_client.exists(date_commits_yt_table_name_temp):
            yt_client.create("table", date_commits_yt_table_name_temp, attributes=attributes)

        logger.debug("Update size: " + str(len(table_update)))
        yt_client.write_table(yt_client.TablePath(date_commits_yt_table_name_temp, append=True), table_update)
        # The final table is replaced only after the temp table is fully written.
        yt_client.move(date_commits_yt_table_name_temp, date_commits_yt_table_name, force=True)


@app.task
@locked('deactivate_archived_repos')
def deactivate_archived_repos():
    """Deactivate active GitHub repos that are archived upstream, and link forks.

    For every active repo of the ``github`` source the repository metadata is
    requested through the crawler's API wrapper. Then:

    * a repo reported as archived is marked inactive;
    * a repo reported as a fork that has no parent yet gets its parent set,
      provided the upstream parent is itself one of the active GitHub repos;
    * deactivation is propagated from archived repos to their forks among the
      repos changed in this run.

    Repos whose metadata could not be fetched are logged and left untouched.
    All changed repos are saved at the end.
    """
    logger = logging.getLogger(
        '.'.join((__name__, 'deactivate_archived_repos')),
    )

    github_source = Source.objects.get(name='github')
    # Loaded once: the same Repo instances are modified below and saved at the end.
    github_repos = list(Repo.objects.filter(
        source=github_source,
        is_active=True
    ))

    github_repos_by_name = {
        repo.vcs_name: repo
        for repo in github_repos
    }

    github_repos_metadata = dict()

    crawler = get_crawler(github_source)

    for repo in github_repos:
        try:
            # The API object itself is kept (not its dict form), because
            # `.as_dict()`, `.fork` and `.parent` are all read from it below.
            repo_metadata = crawler.api_wrapper().repository(repo.owner, repo.name)
        except Exception:
            logger.exception("Failed to fetch {} metadata".format(repo.vcs_name))
            # Skip the repo; otherwise the metadata of the previous repo
            # (or an unbound name) would be recorded for it.
            continue
        # NOTE(review): presumably the wrapper may return None for a missing
        # repository - such repos are skipped as well; confirm against the API.
        if repo_metadata is None:
            continue
        github_repos_metadata[repo.vcs_name] = repo_metadata

    archived_repo_ids = set()
    changed_repos = dict()

    for repo in github_repos:
        repo_metadata = github_repos_metadata.get(repo.vcs_name)
        if repo_metadata is None:
            continue

        if repo_metadata.as_dict().get("archived"):
            archived_repo_ids.add(repo.id)
            repo.is_active = False
            changed_repos[repo.id] = repo

        if repo_metadata.fork and repo.parent_id is None:
            parent_repo_metadata = repo_metadata.parent
            if parent_repo_metadata:
                parent_vcs_name = '%s/%s' % (parent_repo_metadata.owner.login, parent_repo_metadata.name)
                # The upstream parent is often not tracked here (or no longer
                # active); such forks simply keep an empty parent.
                parent_repo = github_repos_by_name.get(parent_vcs_name)
                if parent_repo is not None:
                    repo.parent = parent_repo
                    changed_repos[repo.id] = repo

    logger.debug("Found %s archived repos" % len(archived_repo_ids))

    # parent repo id -> ids of its forks among the changed repos
    forks_by_id = defaultdict(list)
    for repo_id, repo in changed_repos.items():
        if repo.parent_id is not None:
            forks_by_id[repo.parent_id].append(repo_id)

    # Propagate deactivation starting from archived repos only, so a repo that
    # merely received a parent in this run stays active. The walk is iterative
    # and tracks visited ids, so an unexpected parent cycle cannot loop forever.
    # NOTE(review): forks that were not changed in this run are not reached -
    # confirm whether they should be deactivated as well.
    pending = list(archived_repo_ids)
    visited = set()
    while pending:
        repo_id = pending.pop()
        if repo_id in visited:
            continue
        visited.add(repo_id)
        changed_repos[repo_id].is_active = False
        pending.extend(forks_by_id.get(repo_id, ()))

    for repo in changed_repos.values():
        repo.save()
