import re
import logging

from datetime import datetime, timedelta

import luigi

import yt.wrapper as yt
from yt.common import datetime_to_string

from crypta.graph.soup.config.python import EDGE_TYPE, ID_TYPE
from crypta.graph.mrcc_opt.python.native import mrcc

from crypta.graph.soupy_indevice.lib import config
from crypta.graph.soupy_indevice.lib.link_counts import calculate_link_counts, calculate_link_count_percentiles
from crypta.graph.soupy_indevice.lib.preprocess import preprocess_soup, good_id_yql
from crypta.graph.soupy_indevice.lib.util import run_yql, timed
from crypta.graph.soupy_indevice.lib.luigi_tasks import BuildIndeviceParams, FinalizeIndevice


logger = logging.getLogger(__name__)


# Size threshold used by fast_match: newly formed components with more ids than
# this are kept out of its output/sizes tables and their ids are recorded as bad
# (reason 'fast huge component') instead.
HUGE_INDEVICE = 20000


def mktmpdir(root="//tmp", expire_seconds=24 * 60 * 60):
    """Create a fresh map node under ``root`` carrying an ``expiration_time`` attribute.

    :param root: parent path; a trailing "/" is appended when missing.
    :param expire_seconds: offset from the current UTC time used for the
        ``expiration_time`` attribute (defaults to 24 hours).
    :return: path of the created node.
    """
    parent = root if root.endswith("/") else root + "/"
    tmp_path = yt.find_free_subpath(parent)

    # yt.mkdir can't create a directory with specified attributes, and
    # expiration_time may not be set after creation inside a transaction,
    # so the node is created directly with the attribute already attached.
    expires_at = datetime.utcnow() + timedelta(seconds=expire_seconds)
    yt.create(
        "map_node",
        tmp_path,
        attributes={"expiration_time": datetime_to_string(expires_at)},
    )
    return tmp_path


@timed
def find_connected_components(inp, outp, transaction, uname="u", vname="v", compname="indevice_id"):
    """Run ``mrcc.find_components`` on an edge table and fail loudly if it does not converge.

    :param inp: source edge table, passed as ``source``.
    :param outp: destination table, passed as ``destination``.
    :param transaction: object whose ``transaction_id`` is forwarded (as a string).
    :param uname: name of the first edge endpoint column.
    :param vname: name of the second edge endpoint column; also passed as the vertex field.
    :param compname: name of the component label column.
    :raises Exception: when ``mrcc.find_components`` returns a falsy value.
    """
    proxy, token, pool = config.YT_PROXY, config.YT_TOKEN, config.YT_POOL

    # Scratch directory for MRCC; created via mktmpdir with its default expiration.
    scratch_dir = mktmpdir()

    ok = mrcc.find_components(
        proxy=proxy,
        token=token,
        transaction_id=str(transaction.transaction_id),
        pool=pool,
        source=inp,
        destination=outp,
        workdir=scratch_dir,
        previous_labels="",
        edge_first_fields=[uname],
        edge_second_fields=[vname],
        vertex_fields=[vname],
        component_field=compname,
        max_iterations_count=200,
        with_start_preparing=True,
        with_finish_preparing=True,
        script_name="soupy_indevice_mrcc",
    )

    if ok:
        return
    raise Exception("MRCC has not converged")


def cleanup(output_dir, keep_count, ytclient=None):
    """Remove all but the first ``keep_count`` date-named nodes of ``output_dir``.

    Only children whose path ends in ``/YYYY-MM-DD`` are considered; they are
    ordered in descending string order, so the greatest names are the ones kept.

    :param output_dir: directory to list.
    :param keep_count: number of leading (greatest-named) entries to keep.
    :param ytclient: object providing ``list`` and ``remove``; the ``yt`` module
        is used when omitted.
    """
    client = yt if ytclient is None else ytclient

    dated_path = re.compile(r"^.*/\d{4}-\d{2}-\d{2}$")
    candidates = [path for path in client.list(output_dir, absolute=True) if dated_path.match(path)]
    candidates.sort(reverse=True)

    for stale in candidates[keep_count:]:
        print("Removing {}".format(stale))
        client.remove(stale, recursive=True)


def indevice_edge_types():
    """Return the list of soup edge types whose ``Usage.SoupyIndevice`` flag is truthy."""
    selected = []
    for edge_type in EDGE_TYPE.values():
        if edge_type.Usage.SoupyIndevice:
            selected.append(edge_type)
    return selected


def soup_tables_from_dir(soup_dir, edge_types=None, fallback_date=None):
    """Build soup table paths for ``edge_types`` and determine the soup date.

    :param soup_dir: directory holding one table per edge type.
    :param edge_types: edge types to build paths for; defaults to
        ``indevice_edge_types()``.
    :param fallback_date: date returned when no table carries a
        ``generate_date`` attribute.
    :return: ``(table_paths, date)`` where ``date`` is the smallest
        ``generate_date`` found, or ``fallback_date`` when none was found.
    :raises Exception: no ``generate_date`` found and no fallback given.
    :raises ValueError: found dates span more than 3 days.
    """
    if edge_types is None:
        edge_types = indevice_edge_types()

    soup_tables = [yt.ypath_join(soup_dir, EDGE_TYPE.name(et)) for et in edge_types]

    # NOTE(review): dates are collected over indevice_edge_types() filtered by
    # Usage.SoupUpdate, not over the `edge_types` argument -- confirm this is intended.
    found_dates = set()
    for et in indevice_edge_types():
        if not et.Usage.SoupUpdate:
            continue
        generate_date = yt.get_attribute(yt.ypath_join(soup_dir, EDGE_TYPE.name(et)), "generate_date", None)
        if generate_date:
            found_dates.add(generate_date)

    if not found_dates:
        if fallback_date is None:
            raise Exception("Couldn't find 'generate_date' attribute on any of soup tables")
        return soup_tables, fallback_date

    oldest = min(found_dates)
    newest = max(found_dates)
    spread = datetime.strptime(newest, "%Y-%m-%d") - datetime.strptime(oldest, "%Y-%m-%d")
    if spread > timedelta(days=3):
        raise ValueError("Soup delta overflow [{min!r} .. {max!r}]".format(min=oldest, max=newest))

    return soup_tables, oldest


def stream_soup_tables(soup_dir, stream_soup_dir, dates):
    """Collect soup table paths: daily tables for non-streaming logs plus streaming tables.

    Indevice edge types are split by whether their ``LogSource.Name`` is listed in
    ``config.STREAMING_LOGS``. Non-streaming tables are taken from
    ``<soup_dir>/day/<date>`` for every date in ``dates`` (in sorted order);
    streaming tables are taken from ``stream_soup_dir``.

    :param soup_dir: soup root; daily tables live under its ``day`` subdirectory.
    :param stream_soup_dir: directory with the streaming soup tables.
    :param dates: iterable of date strings naming the daily subdirectories.
    :return: list of non-streaming table paths followed by streaming table paths.
    :raises Exception: propagated from ``soup_tables_from_dir`` when no
        ``generate_date`` can be found and no fallback date is available
        (e.g. ``dates`` is empty and the streaming tables carry no date).
    """
    stream_et = []
    nonstream_et = []
    for et in indevice_edge_types():
        if et.LogSource.Name in config.STREAMING_LOGS:
            stream_et.append(et)
        else:
            nonstream_et.append(et)

    day_soup_dir = yt.ypath_join(soup_dir, "day")

    nonstream_tables = []
    # Stays None when `dates` is empty, so the streaming lookup below gets an
    # explicit "no fallback" instead of hitting an unbound local.
    nonstream_date = None

    for date in sorted(dates):
        tables, nonstream_date = soup_tables_from_dir(
            yt.ypath_join(day_soup_dir, date), edge_types=nonstream_et, fallback_date=date
        )
        nonstream_tables += tables

    # The date resolved for the last processed day serves as the fallback for
    # the streaming tables; the date returned for them is not needed here.
    stream_tables, _ = soup_tables_from_dir(stream_soup_dir, edge_types=stream_et, fallback_date=nonstream_date)

    return nonstream_tables + stream_tables


@timed
def fast_match(stream_soup_tables, dates, indevice, sizes, bad_ids, bad_edges, output, out_sizes, out_bad_ids, tx):
    """Extend an existing indevice table with ids found in fresh soup edges.

    The work is done by two YQL queries with a connected-components pass in between:

    1. Edges are read from ``stream_soup_tables``, filtered (see the first query's
       WHERE clause), de-duplicated, stripped of edges touching ``bad_ids``
       (other than reason 'orphaned') or listed in ``bad_edges``, and joined on
       both ends against ``indevice``. Edges with exactly one known end produce
       rows assigning the known indevice_id to the other id; edges with no known
       end are written out as candidate new components. Edges with both ends
       known are ignored here.
    2. After ``find_connected_components`` labels the candidates, the second
       query merges untouched, updated and new clusters into ``output`` and
       ``out_sizes``. New clusters larger than ``HUGE_INDEVICE`` are left out of
       both, and their ids are added to ``out_bad_ids`` with reason
       'fast huge component'. The query also enforces (via ``Ensure``) that each
       id ends up in a single indevice_id.

    :param stream_soup_tables: iterable of soup table paths (joined into a YQL list).
    :param dates: iterable of date strings matched against each edge's ``dates``.
    :param indevice: path of the current indevice table; its ``generate_date``
        attribute is read at the end.
    :param sizes: path of the current cluster sizes table.
    :param bad_ids: path of the current bad ids table.
    :param bad_edges: path of the bad edges table.
    :param output: path of the resulting indevice table.
    :param out_sizes: path of the resulting sizes table.
    :param out_bad_ids: path of the resulting bad ids table.
    :param tx: object whose ``transaction_id`` is passed to YQL as the external transaction.
    """
    with yt.TempTable() as new_comps, yt.TempTable() as new_ids_indev, yt.TempTable() as new_comps_cc:
        # Step 1: classify fresh edges by how many of their ends already have an indevice_id.
        # Note that only the literal template is formatted; the good_id_yql() prefix is not.
        run_yql(
            good_id_yql()
            + """
            PRAGMA SimpleColumns;
            PRAGMA yt.ExternalTx = '{tx}';
            PRAGMA yt.DisableJobSplitting;

            $indevice = '{indevice}';
            $soup_dates = ToSet(AsList('{dates}'));

            $day_soup_dev = (
                select
                    Identifiers::StrictNormalize(id1Type, id1) as id1,
                    id1Type,
                    Identifiers::StrictNormalize(id2Type, id2) as id2,
                    id2Type,
                    ListMin(dates) as date_begin,
                    ListMax(dates) as date_end
                from EACH(AsList('{stream_soup}'))
                WITH COLUMNS Struct<dates:List<String>?>
                where
                    $good_id(id1) and $good_id(id2)
                    and (
                        (`date` is not null)
                        or (not SetIsDisjoint(ToSet(dates), $soup_dates))
                    ) and $is_good_ls_day(sourceType, logSource, ListLength(dates))
            );

            $day_soup = (
                select
                    Identifiers::StrictNormalize(id1Type, id1) as id1,
                    id1Type,
                    Identifiers::StrictNormalize(id2Type, id2) as id2,
                    id2Type,
                    date_begin,
                    date_end
                from $day_soup_dev
                where Identifiers::IsValid(id1Type, id1)
                    AND Identifiers::IsValid(id2Type, id2)
            );

            $bad_ids = (
                select distinct id, id_type from `{bad_ids}`
                where reason != 'orphaned'
            );

            $bad_edges = (
                select distinct id1, id1Type, id2, id2Type
                from `{bad_edges}`
            );

            $day_soup_distinct = (
                select id1, id1Type, id2, id2Type, min(date_begin) as date_begin, max(date_end) as date_end
                from $day_soup
                group by id1, id1Type, id2, id2Type
            );

            $day_soup_with_indevice = (
                select
                    a.id1 as id1,
                    a.id1Type as id1Type,
                    a.id2 as id2,
                    a.id2Type as id2Type,
                    a.date_begin as date_begin,
                    a.date_end as date_end,
                    b.indevice_id as id1_indevice,
                    c.indevice_id as id2_indevice
                from $day_soup_distinct as a
                left join $indevice as b on a.id1 = b.id and a.id1Type = b.id_type
                left join $indevice as c on a.id2 = c.id and a.id2Type = c.id_type
                left only join $bad_ids as d on a.id1 = d.id and a.id1Type = d.id_type
                left only join $bad_ids as e on a.id2 = e.id and a.id2Type = e.id_type
                left only join $bad_edges as f on
                    a.id1 = f.id1 and
                    a.id2 = f.id2 and
                    a.id1Type = f.id1Type and
                    a.id2Type = f.id2Type
            );

            -- We only handle two cases here:
            -- 1. There's a known indevice_id for exactly one id of a pair: assign the same id to the other end
            $new_ids_indev = (
                select
                    id1 as id,
                    id1Type as id_type,
                    id2_indevice as indevice_id,
                    date_begin,
                    date_end
                from $day_soup_with_indevice
                where id1_indevice is null and id2_indevice is not null

                union all

                select
                    id2 as id,
                    id2Type as id_type,
                    id1_indevice as indevice_id,
                    date_begin,
                    date_end
                from $day_soup_with_indevice
                where id2_indevice is null and id1_indevice is not null
            );

            -- 1.5. There can be links to several old indevice-ids for an id: just select one of them
            $new_ids_indev_uniq = (
                select
                    id,
                    id_type,
                    min(indevice_id) as indevice_id,
                    min(date_begin) as date_begin,
                    max(date_end) as date_end
                from $new_ids_indev
                group by id, id_type
            );

            -- 2. There aren't any known indevice_ids for both ids of a pair: we'll generate something
            $new_components = (
                select t.*,
                (t.id1Type || '_' || t.id1) as u,
                (t.id2Type || '_' || t.id2) as v,
                from $day_soup_with_indevice as t
                left only join $new_ids_indev_uniq as a on t.id1 = a.id and t.id1Type = a.id_type
                left only join $new_ids_indev_uniq as b on t.id2 = b.id and t.id2Type = b.id_type
                where t.id1_indevice is null and t.id2_indevice is null
            );

            -- Cases when indevice_id is known for both ends of a pair should be handled by the slow full process,
            -- and therefore are just ignored here

            insert into `{new_ids_indev}` with truncate
            select id, id_type, indevice_id, min(date_begin) as date_begin, max(date_end) as date_end
            from $new_ids_indev_uniq
            group by id, id_type, indevice_id;

            insert into `{new_comps}` with truncate
            select * from $new_components;
        """.format(
                tx=tx.transaction_id,
                indevice=indevice,
                stream_soup="','".join(stream_soup_tables),
                dates="','".join(dates),
                new_ids_indev=new_ids_indev,
                bad_ids=bad_ids,
                bad_edges=bad_edges,
                new_comps=new_comps,
            )
        )

        # Step 2: label connected components among edges with no known indevice_id
        # (uses the default u/v columns produced by the query above).
        find_connected_components(new_comps, new_comps_cc, tx)

        # Step 3: merge untouched clusters, clusters that received new ids and
        # brand-new clusters into the output tables.
        run_yql(
            """
            PRAGMA SimpleColumns;
            PRAGMA yt.ExternalTx = '{tx}';
            PRAGMA yt.InferSchema = '100';
            PRAGMA yt.DisableJobSplitting;

            $update_indevice_ids = (
                select distinct indevice_id
                from `{new_ids_indev}`
            );

            $update_indevice = (
                select
                    a.indevice_id as indevice_id,
                    a.id as id,
                    a.id_type as id_type,
                    a.date_begin as date_begin,
                    a.date_end as date_end,
                from `{indevice}` as a
                inner join $update_indevice_ids as b using (indevice_id)

                union all

                select * from `{new_ids_indev}`
            );

            $update_indevice_props = (
                select
                    indevice_id,
                    count(*) as size,
                    min(date_begin) as cluster_date_begin,
                    max(date_end) as cluster_date_end
                from $update_indevice
                group by indevice_id
            );

            $update_indevice_joined = (
                select a.*,
                       b.size as size,
                       b.cluster_date_begin as cluster_date_begin,
                       b.cluster_date_end as cluster_date_end
                from $update_indevice as a
                inner join $update_indevice_props as b using (indevice_id)
            );

            $new_comp_ids = (
                select id, id_type, v, min(date_begin) as date_begin, max(date_end) as date_end from (
                    select id1 as id, id1Type as id_type, date_begin, date_end, (id1Type || '_' || id1) as v
                    from `{new_comps}`
                    union all
                    select id2 as id, id2Type as id_type, date_begin, date_end, (id2Type || '_' || id2) as v
                    from `{new_comps}`
                )
                group by id, id_type, v
            );

            $new_comp_ids_with_cc = (
                select a.*, b.indevice_id as indevice_id
                from $new_comp_ids as a
                inner join `{new_comps_cc}` as b using (v)
            );

            $new_indevice_props = (
                select
                    indevice_id,
                    count(*) as size,
                    min(date_begin) as cluster_date_begin,
                    max(date_end) as cluster_date_end,
                    Digest::Md5Hex(
                        min_by(
                            id_type || '_' || id,
                            (date_begin ?? '1970-01-01') || id
                        )
                    ) as stable_indevice_id
                from $new_comp_ids_with_cc
                group by indevice_id
            );

            $new_indevice = (
                select
                    a.id as id,
                    a.id_type as id_type,
                    a.date_begin as date_begin,
                    a.date_end as date_end,
                    b.stable_indevice_id as indevice_id,
                    b.size as size,
                    b.cluster_date_begin as cluster_date_begin,
                    b.cluster_date_end as cluster_date_end
                from $new_comp_ids_with_cc as a
                inner join $new_indevice_props as b using (indevice_id)
            );

            insert into `{out_bad_ids}` with truncate
            select * from `{bad_ids}`
            union all
            select id, id_type, 'fast huge component' as reason
            from $new_indevice
            where size > {huge};

            insert into `{out_sizes}` with truncate
            select * from (
                select a.* from `{sizes}` as a
                left only join $update_indevice_ids as b using (indevice_id)
                union all
                select indevice_id, size from $update_indevice_props
                union all
                select stable_indevice_id as indevice_id, size from $new_indevice_props
                where size <= {huge}
            )
            order by size desc;

            $combined = (
                select a.*
                from `{indevice}` as a
                left only join $update_indevice_ids as b on a.indevice_id = b.indevice_id
                union all
                select * from $update_indevice_joined
                union all
                select * from $new_indevice
                where size <= {huge}  -- let accurate slow process split newly formed huge components
            );

            $uniq_output = (
                select
                    id,
                    id_type,
                    some(date_begin) as date_begin,
                    some(date_end) as date_end,
                    some(cluster_date_begin) as cluster_date_begin,
                    some(cluster_date_end) as cluster_date_end,
                    some(indevice_id) as indevice_id,
                    some(size) as size,
                    Ensure(null, count(*) == 1, 'each id must belong to a single indevice_id') as check_uniq
                from $combined
                group compact by id_type, id
            );

            insert into `{output}` with truncate
            select
                id,
                id_type,
                date_begin,
                date_end,
                cluster_date_begin,
                cluster_date_end,
                indevice_id,
                size
            from $uniq_output
            order by id_type, id;

            discard select check_uniq from $uniq_output;

        """.format(
                tx=tx.transaction_id,
                indevice=indevice,
                sizes=sizes,
                bad_ids=bad_ids,
                new_ids_indev=new_ids_indev,
                new_comps=new_comps,
                new_comps_cc=new_comps_cc,
                output=output,
                out_sizes=out_sizes,
                out_bad_ids=out_bad_ids,
                huge=HUGE_INDEVICE,
            )
        )

    # Hand the input table's generate_date and the output path to set_output_date_attr
    # (defined outside this view; presumably it stamps the date on `output` -- confirm).
    set_output_date_attr(yt.get("{indevice}/@generate_date".format(indevice=indevice)), output)


@timed
def additional_split(soup_full, soup_cc, extra_bad_edges, size_thresh, soup_cc_out, bad_edges_out, ytclient, tx):
    """Re-split components that reached ``size_thresh`` by dropping extra bad edges.

    Components of ``soup_cc`` with fewer than ``size_thresh`` rows are copied to
    ``soup_cc_out`` as is. For the remaining components, edges of ``soup_full``
    with both ends inside such components are collected; those also present in
    ``extra_bad_edges`` and not matching one of the trusted id-type pairs are
    written to ``bad_edges_out`` with reason 'p95 (additional)'. Connected
    components are then recomputed over the surviving edges and, when that
    result is non-empty, inserted into ``soup_cc_out``.

    :param soup_full: path of the full edge table (with ``u``/``v`` columns).
    :param soup_cc: path of the component labelling to refine.
    :param extra_bad_edges: path of the candidate bad edges table.
    :param size_thresh: component size at or above which a component is re-split.
    :param soup_cc_out: path of the refined component labelling.
    :param bad_edges_out: path of the table receiving the dropped edges.
    :param ytclient: client used for temp tables and row counts.
    :param tx: object whose ``transaction_id`` is passed to YQL as the external transaction.
    """
    with ytclient.TempTable() as soup_small, ytclient.TempTable() as soup_small_cc:

        # TODO: This should probably come from soup config (EdgeStrength == TRUSTED)
        #       We don't have the full edge type here at the moment, only id types
        trusted_idt_pairs = [
            (ID_TYPE.IDFA.Name, ID_TYPE.MM_DEVICE_ID.Name),
            (ID_TYPE.MM_DEVICE_ID.Name, config.FAKE_APP_IDTYPE),
            (ID_TYPE.GAID.Name, ID_TYPE.MM_DEVICE_ID.Name),
            (ID_TYPE.OAID.Name, ID_TYPE.MM_DEVICE_ID.Name),
        ]

        # Split components by size, pick the bad edges inside oversized ones and
        # write the remaining edges of oversized components to soup_small.
        run_yql(
            """
            PRAGMA SimpleColumns;
            PRAGMA yt.ExternalTx = '{tx}';
            PRAGMA yt.DisableJobSplitting;

            -- soup_cc doesn't make correct output schema, so infer it
            PRAGMA yt.InferSchema;

            $soup_cc = '{soup_cc}';
            $size_thresh = {size_thresh};
            $extra_bad_edges = '{extra_bad_edges}';
            $soup_full = '{soup_full}';
            $soup_small = '{soup_small}';
            $soup_cc_out = '{soup_cc_out}';
            $bad_edges_out = '{bad_edges_out}';

            $cc_sizes = (
                select indevice_id, count(*) as size
                from $soup_cc
                group by indevice_id
            );

            $acceptable_cc = (
                select indevice_id from $cc_sizes
                where size < $size_thresh
            );

            $unacceptable_cc = (
                select indevice_id from $cc_sizes
                where size >= $size_thresh
            );

            $unacceptable_cc_ids = (
                select a.v as v
                from $soup_cc as a
                inner join $unacceptable_cc as b using (indevice_id)
            );

            $unacceptable_cc_edges = (
                select a.*
                from $soup_full as a
                inner join $unacceptable_cc_ids as b on a.u = b.v
                inner join $unacceptable_cc_ids as c on a.v = c.v
            );

            $unacceptable_cc_bad_edges = (
                select a.* from $unacceptable_cc_edges as a
                inner join $extra_bad_edges as b using(id1, id1Type, id2, id2Type)
                where not ({idtype_clause})
            );

            insert into $soup_cc_out with truncate
            select a.* from $soup_cc as a
            inner join $acceptable_cc as b using (indevice_id);

            insert into $bad_edges_out with truncate
            select t.*, 'p95 (additional)' as reason from $unacceptable_cc_bad_edges as t;

            insert into $soup_small with truncate
            select a.*
            from $unacceptable_cc_edges as a
            left only join $unacceptable_cc_bad_edges as b using (id1, id2, id1Type, id2Type);
        """.format(
                tx=tx.transaction_id,
                soup_cc=soup_cc,
                size_thresh=size_thresh,
                soup_cc_out=soup_cc_out,
                bad_edges_out=bad_edges_out,
                extra_bad_edges=extra_bad_edges,
                soup_full=soup_full,
                soup_small=soup_small,
                idtype_clause=" or ".join(
                    ["(a.id1Type='{}' and a.id2Type='{}')".format(x[0], x[1]) for x in trusted_idt_pairs]
                ),
            )
        )

        # Recompute components over the edges that survived the extra filtering.
        find_connected_components(soup_small, soup_small_cc, tx)

        if (
            ytclient.row_count(soup_small_cc) > 0
        ):  # The following yql fails if soup_small_cc is empty (schema can't be inferred)
            # This insert has no `with truncate`, so it adds to what the first query wrote.
            run_yql(
                """
                PRAGMA SimpleColumns;
                PRAGMA yt.ExternalTx = '{tx}';
                PRAGMA yt.InferSchema;
                PRAGMA yt.DisableJobSplitting;

                insert into `{soup_cc_out}`
                select * from `{soup_small_cc}`;
            """.format(
                    tx=tx.transaction_id, soup_cc_out=soup_cc_out, soup_small_cc=soup_small_cc
                )
            )


@timed
def build_link_count_percentiles(soup_tables, idstorage, output_table):
    """Preprocess soup, count links and write link-count percentiles to ``output_table``.

    All three steps run inside one YT transaction; intermediate results live in
    temporary tables. The bad ids produced by preprocessing are not used here.

    :param soup_tables: soup tables handed to ``preprocess_soup``.
    :param idstorage: id storage handed to ``preprocess_soup``.
    :param output_table: destination of ``calculate_link_count_percentiles``.
    """
    with yt.Transaction() as tx:
        with yt.TempTable() as counts, yt.TempTable() as pairs, yt.TempTable() as unused_bad_ids:
            preprocess_soup(soup_tables, idstorage, pairs, unused_bad_ids, tx)
            calculate_link_counts(pairs, counts, tx)
            calculate_link_count_percentiles(counts, output_table, tx)


@timed
def filter_by_id_info(preprocessed_pairs, out_bad_ids, out_bad_edges, tx):
    """Derive bad ids and bad edges from the device info columns of preprocessed pairs.

    Bad ids (reason 'many models'): ids for which more than two distinct
    (manufacturer, model) combinations are visible, either on the id itself or
    on ids directly linked to it.

    Bad edges:
      * 'os mismatch' -- both ends have a known OS and the values differ;
      * 'no mobile distr ui' -- a 'distr_ui' id linked to an id whose OS is
        'android' or 'ios'.

    :param preprocessed_pairs: path of the preprocessed pairs table.
    :param out_bad_ids: path of the table receiving bad ids (truncated first).
    :param out_bad_edges: path of the table receiving bad edges (truncated first).
    :param tx: object whose ``transaction_id`` is passed to YQL as the external transaction.
    """
    run_yql(
        """
        PRAGMA SimpleColumns;
        PRAGMA yt.ExternalTx = '{tx}';
        PRAGMA yt.DisableJobSplitting;

        $preproc_pairs = '{preprocessed_pairs}';

        -- This calculates all the models visible for the identifier:
        --     either on the identifier itself (idfa/gaid/oaid/mm_device_id)
        -- or on the identifiers directly adjacent to it (uuid/yandexuid)
        $distinct_models = (
            select distinct id1, id1Type, manufacturer, model from (
                -- id1
                select
                    id1,
                    id1Type,
                    id1_manufacturer as manufacturer,
                    id1_model as model
                from $preproc_pairs
                where id1_model is not null

                union all

                -- id2
                select
                    id2 as id1,
                    id2Type as id1Type,
                    id2_manufacturer as manufacturer,
                    id2_model as model
                from $preproc_pairs
                where id2_model is not null

                union all

                -- id1 adjacent
                select
                    id1,
                    id1Type,
                    id2_manufacturer as manufacturer,
                    id2_model as model
                from $preproc_pairs
                where id2_model is not null

                union all

                -- id2 adjacent
                select
                    id2 as id1,
                    id2Type as id1Type,
                    id1_manufacturer as manufacturer,
                    id1_model as model
                from $preproc_pairs
                where id1_model is not null
            )
        );

        $bad_ids = (
            select id1 as id, id1Type as id_type, 'many models' as reason
            from $distinct_models
            group by id1, id1Type
            having count(*) > 2  -- It seems that for many phone models there are exactly 2 variants of the model,
                                 -- with and without firmware version, "Galaxy A6" and "SM-A600G" for instance.
        );

        insert into `{out_bad_ids}` with truncate
        select * from $bad_ids;

        insert into `{out_bad_edges}` with truncate
        select a.*, 'os mismatch' as reason from $preproc_pairs as a
        where id1_os is not null and id2_os is not null and id1_os != id2_os

        union all

        select a.*, 'no mobile distr ui' as reason from $preproc_pairs as a
        where (a.id1Type = 'distr_ui' and id2_os in ('android', 'ios')) or
              (a.id2Type = 'distr_ui' and id1_os in ('android', 'ios'));
    """.format(
            tx=tx.transaction_id,
            preprocessed_pairs=preprocessed_pairs,
            out_bad_ids=out_bad_ids,
            out_bad_edges=out_bad_edges,
        )
    )


@timed
def filter_yp_cookie(preprocessed_pairs, link_counts, out_bad_edges, tx):
    """Mark single-day yp-cookie edges of yandexuids that have more than one link.

    An edge is written to ``out_bad_edges`` with reason 'single day yp' when
    its ``date_begin`` equals ``date_end``, its ``sourceType`` is one of
    'access-yp-did', 'watch-yp-did-android', 'watch-yp-did-ios', and the
    yandexuid end has a total link count (sum of ``c`` over ``link_counts``)
    greater than 1.

    NOTE(review): the INSERT has no ``with truncate``, so it presumably adds to
    whatever ``out_bad_edges`` already holds -- confirm this is intended.

    :param preprocessed_pairs: path of the preprocessed pairs table.
    :param link_counts: path of the per-id link counts table.
    :param out_bad_edges: path of the table receiving the bad edges.
    :param tx: object whose ``transaction_id`` is passed to YQL as the external transaction.
    """
    run_yql(
        """
        PRAGMA SimpleColumns;
        PRAGMA yt.ExternalTx = '{tx}';
        PRAGMA yt.DisableJobSplitting;

        $link_count_totals = (
            select id1, id1Type, sum(c) as c
            from `{link_counts}`
            group by id1, id1Type
        );

        insert into `{out_bad_edges}`
        select a.*, 'single day yp' as reason
        from `{preprocessed_pairs}` as a
        inner join $link_count_totals as b on a.id1 = b.id1 and a.id1Type = b.id1Type
        inner join $link_count_totals as c on a.id2 = c.id1 and a.id2Type = c.id1Type
        where
            a.date_begin = a.date_end and
            (a.sourceType in ('access-yp-did', 'watch-yp-did-android', 'watch-yp-did-ios')) and
            ((a.id1Type = 'yandexuid' and b.c > 1) or (a.id2Type = 'yandexuid' and c.c > 1))
    """.format(
            tx=tx.transaction_id,
            link_counts=link_counts,
            preprocessed_pairs=preprocessed_pairs,
            out_bad_edges=out_bad_edges,
        )
    )


@timed
def filter_single_day_cookies(preprocessed_pairs, out_bad_edges, tx):
    """Mark single-day edges between yandexuids and the id types in ``appm_idtypes``.

    Only edges linking a 'yandexuid' with one of IDFA, GAID, OAID, MM_DEVICE_ID,
    UUID or ``config.FAKE_APP_IDTYPE`` are considered. Among them, edges with
    ``date_begin = date_end`` are treated as suspicious:

      * if the yandexuid also has an edge spanning more than one day, every
        suspicious edge of that yandexuid is marked 'single day yuid';
      * otherwise all but the first suspicious edge of that yandexuid (ordered
        by ``date_begin, date_end``) are marked 'single day yuid 2'.

    :param preprocessed_pairs: path of the preprocessed pairs table.
    :param out_bad_edges: path of the table receiving bad edges (truncated first).
    :param tx: object whose ``transaction_id`` is passed to YQL as the external transaction.
    """
    # Id type names allowed on the non-yandexuid end of a considered edge.
    appm_idtypes = [x.Name for x in [ID_TYPE.IDFA, ID_TYPE.GAID, ID_TYPE.OAID, ID_TYPE.MM_DEVICE_ID, ID_TYPE.UUID]] + [
        config.FAKE_APP_IDTYPE
    ]
    run_yql(
        """
        PRAGMA yt.ExternalTx = '{tx}';
        PRAGMA yt.DisableJobSplitting;

        $pairs = '{preprocessed_pairs}';

        $yuid_edges = (
            select * from $pairs
            where (id1Type = 'yandexuid' and id2Type in ('{appm_idtypes}') or
                   id2Type = 'yandexuid' and id1Type in ('{appm_idtypes}'))
        );

        $suspicious_edges = (
            select * from $yuid_edges
            where date_begin = date_end
        );

        $ok_edges = (
            select id1, id1Type, id2, id2Type from $yuid_edges
            where date_begin != date_end
        );

        $ok_edges_yuids = (
            select distinct id, id_type from (
                select id1 as id, id1Type as id_type from $ok_edges where id1Type = 'yandexuid'
                union all
                select id2 as id, id2Type as id_type from $ok_edges where id2Type = 'yandexuid'
            )
        );

        $suspiciouser_edges = (
            SELECT a.*
            FROM (SELECT * FROM $suspicious_edges WHERE id1Type == "yandexuid") AS a
            LEFT ONLY JOIN $ok_edges_yuids AS b
            ON (a.id1 == b.id)

            UNION ALL

            SELECT a.*
            FROM (SELECT * FROM $suspicious_edges WHERE id2Type == "yandexuid") AS a
            LEFT ONLY JOIN $ok_edges_yuids AS b
            ON (a.id2 == b.id)
        );

        $suspiciouser_edges_one_day = (
            -- Single day edges for yuids that have multi-day edges
            SELECT a.*
            FROM (SELECT * FROM $suspicious_edges WHERE id1Type == "yandexuid") AS a
            LEFT SEMI JOIN $ok_edges_yuids AS b
            ON (a.id1 == b.id)

            UNION ALL

            -- Single day edges for yuids that have multi-day edges (when that yuid is in id2)
            SELECT a.*
            FROM (SELECT * FROM $suspicious_edges WHERE id2Type == "yandexuid") AS a
            LEFT SEMI JOIN $ok_edges_yuids AS b
            ON (a.id2 == b.id)
        );

        insert into `{out_bad_edges}` with truncate

        select a.*, 'single day yuid' as reason
        from $suspiciouser_edges_one_day as a

        union all

        -- Single day edges for yuids that only have single day edges: discard all but one edge
        select * from (
            select ROW_NUMBER() over w as rn, a.*, 'single day yuid 2' as reason
            from $suspiciouser_edges as a
            where a.id1Type = 'yandexuid'
            window w as (
                partition by id1, id1Type
                order by date_begin, date_end
            )

            union all

            select ROW_NUMBER() over w as rn, a.*, 'single day yuid 2' as reason
            from $suspiciouser_edges as a
            where a.id2Type = 'yandexuid'
            window w as (
                partition by id2, id2Type
                order by date_begin, date_end
            )
        )
        where rn != 1;
    """.format(
            tx=tx.transaction_id,
            preprocessed_pairs=preprocessed_pairs,
            out_bad_edges=out_bad_edges,
            appm_idtypes="','".join(appm_idtypes),
        )
    )


@timed
def filter_by_percentiles(
    preprocessed_pairs, link_counts, link_count_percentiles, out_bad_edges, out_bad_edges_extra, tx
):
    """Mark edges whose endpoints have unusually many links of that id-type pair.

    Each pair is joined with the link counts of both ends (per id and id-type
    pair) and with the percentiles of the corresponding id-type pairs.

      * ``out_bad_edges_extra`` receives edges where either end's count exceeds
        the p95 value (reason 'p95').
      * ``out_bad_edges`` receives edges where either end's count exceeds the
        threshold chosen by ``$select_thresh``: the first of p999, p9999,
        p99999 that is greater than ``$reasonable_value`` (20), otherwise p100.
        The reason string spells out the count, the threshold and the name of
        the percentile used.

    :param preprocessed_pairs: path of the preprocessed pairs table.
    :param link_counts: path of the link counts table.
    :param link_count_percentiles: path of the link count percentiles table.
    :param out_bad_edges: path of the table receiving bad edges (truncated first).
    :param out_bad_edges_extra: path of the table receiving p95 edges (truncated first).
    :param tx: object whose ``transaction_id`` is passed to YQL as the external transaction.
    """
    # Doubled braces in the template are literal YQL braces escaped for str.format.
    run_yql(
        """
        PRAGMA SimpleColumns;
        PRAGMA yt.ExternalTx = '{tx}';
        PRAGMA yt.DisableJobSplitting;

        $pairs = '{preprocessed_pairs}';
        $link_counts = '{link_counts}';
        $link_count_perc = '{link_count_percentiles}';

        $reasonable_value = 20;

        $select_thresh = ($p100, $p99999, $p9999, $p999) -> {{
            return case
                when $p999 > $reasonable_value then ($p999, 'p999')
                when $p9999 > $reasonable_value then ($p9999, 'p9999')
                when $p99999 > $reasonable_value then ($p99999, 'p99999')
                else ($p100, 'p100')
            end;
        }};

        $join = (
            select a.*,
                   b.c as id1_count,
                   d.p95 as id1_p95,
                   $select_thresh(d.p100, d.p99999, d.p9999, d.p999) as id1_thresh,
                   c.c as id2_count,
                   e.p95 as id2_p95,
                   $select_thresh(e.p100, e.p99999, e.p9999, e.p999) as id2_thresh,
            from $pairs as a
            inner join $link_counts as b on a.id1 = b.id1 and a.id1Type = b.id1Type and a.id2Type = b.id2Type
            inner join $link_counts as c on a.id2 = c.id1 and a.id2Type = c.id1Type and a.id1Type = c.id2Type
            inner join $link_count_perc as d on a.id1Type = d.id1Type and a.id2Type = d.id2Type
            inner join $link_count_perc as e on a.id2Type = e.id1Type and a.id1Type = e.id2Type
        );

        $bad_edges_extra = (
            select id1, id1Type, id2, id2Type, 'p95' as reason from $join
            where id1_count > id1_p95 or id2_count > id2_p95
        );

        $bad_edges = (
            select
                id1,
                id1Type,
                id2,
                id2Type,
                case
                    when id1_count > id1_thresh.0
                    then CAST(id1_count AS String) || ' > ' || CAST(id1_thresh.0 AS String) || '(' || CAST(id1_thresh.1 AS String) || ')'
                    when id2_count > id2_thresh.0
                    then CAST(id2_count AS String) || ' > ' || CAST(id2_thresh.0 AS String) || '(' || CAST(id2_thresh.1 AS String) || ')'
                    else 'wtf'
                end as reason
            from $join
            where id1_count > id1_thresh.0 or
                  id2_count > id2_thresh.0
        );

        insert into `{out_bad_edges_extra}` with truncate
        select * from $bad_edges_extra;

        insert into `{out_bad_edges}` with truncate
        select * from $bad_edges;
    """.format(
            tx=tx.transaction_id,
            preprocessed_pairs=preprocessed_pairs,
            link_counts=link_counts,
            link_count_percentiles=link_count_percentiles,
            out_bad_edges_extra=out_bad_edges_extra,
            out_bad_edges=out_bad_edges,
        )
    )


@timed
def prepare_for_cc(preprocessed_pairs, bad_ids, bad_edges, insignificant_edges, soup_full, tx):
    """Filter the preprocessed soup down to the edges used for connected components.

    The generated YQL query:

    * deduplicates ``(id, id_type)`` rows from every table listed in ``bad_ids``
      and ``(id1, id1Type, id2, id2Type)`` rows from every table listed in
      ``bad_edges`` into temporary tables;
    * anti-joins (``left only join``) ``preprocessed_pairs`` against the bad
      edges, against the bad ids on the ``id1`` side and on the ``id2`` side,
      and, only when ``insignificant_edges`` is truthy, against that table too;
    * writes the surviving rows into ``soup_full`` with truncate.

    Args:
        preprocessed_pairs: path of the input table with preprocessed edges.
        bad_ids: iterable of table paths (strings) holding ids to exclude.
        bad_edges: iterable of table paths (strings) holding edges to exclude.
        insignificant_edges: path of an extra edge table to exclude; a falsy
            value skips that anti-join.
        soup_full: path of the output table.
        tx: object whose ``transaction_id`` is passed to ``yt.ExternalTx``.
    """
    query_head = """
        PRAGMA SimpleColumns;
        PRAGMA yt.ExternalTx = '{tx}';
        PRAGMA yt.DisableJobSplitting;

        $preproc_pairs = '{preprocessed_pairs}';
        $bad_ids = AsList('{bad_ids}');
        $bad_edges = AsList('{bad_edges}');
        $insignificant_edges = '{insignificant_edges}';

        INSERT INTO @distinct_bad_ids
        select distinct id, id_type from EACH($bad_ids)
        ORDER BY id, id_type;

        INSERT INTO @distinct_bad_edges
        select distinct id1, id1Type, id2, id2Type from EACH($bad_edges)
        ORDER BY id1, id2, id1Type, id2Type;

        COMMIT;

        $soup_full = (
            select a.*
            from $preproc_pairs as a
            left only join @distinct_bad_edges as d on a.id1 = d.id1 and
                                                  a.id2 = d.id2 and
                                                  a.id1Type = d.id1Type and
                                                  a.id2Type = d.id2Type
            left only join @distinct_bad_ids as b on a.id1 = b.id and a.id1Type = b.id_type
            left only join @distinct_bad_ids as c on a.id2 = c.id and a.id2Type = c.id_type
        """

    # Optional extra anti-join, appended inside the still-open $soup_full subquery.
    insignificant_filter = """
            left only join $insignificant_edges as e on a.id1 = e.id1 and
                                                        a.id2 = e.id2 and
                                                        a.id1Type = e.id1Type and
                                                        a.id2Type = e.id2Type
        """

    query_tail = """
        );

        insert into `{soup_full}` with truncate
        select * from $soup_full;
    """

    fragments = [query_head]
    if insignificant_edges:
        fragments.append(insignificant_filter)
    fragments.append(query_tail)

    # Table lists are spliced into AsList('a','b',...) by joining on "','".
    query = "".join(fragments).format(
        tx=tx.transaction_id,
        preprocessed_pairs=preprocessed_pairs,
        soup_full=soup_full,
        bad_ids="','".join(bad_ids),
        bad_edges="','".join(bad_edges),
        insignificant_edges=insignificant_edges,
    )

    run_yql(query)


@timed
def finalize_indevice(
    soup_full,
    soup_cc,
    collapsed_uuids,
    bad_ids,
    bad_edges,
    output_table,
    output_table_comp_sizes,
    output_bad_ids,
    output_bad_edges,
    tx,
):
    """Assemble the final indevice tables from the soup and its component labels.

    The generated YQL query:

    * collects every id seen on either side of a ``soup_full`` edge, with the
      minimum ``date_begin`` and maximum ``date_end`` per ``(id_type, id)``;
    * attaches ``indevice_id`` from ``soup_cc`` by the vertex key
      ``id_type || '_' || id``, falling back to that key itself when the
      vertex has no row in ``soup_cc``;
    * when ``collapsed_uuids`` is given, left-joins it on ``id2``/``id2Type``
      and substitutes ``id1``/``id1Type`` and its dates where a match exists;
    * computes size, date range and an MD5-based stable id per component and
      joins them back to the ids, asserting that every id lands in exactly
      one component;
    * writes ``output_table`` (ordered by ``id_type, id``) and
      ``output_table_comp_sizes`` (ordered by ``size desc``);
    * writes the concatenation of the ``bad_edges`` tables to
      ``output_bad_edges`` and the ``bad_ids`` tables plus ids of size-1
      components (reason ``'orphaned'``) to ``output_bad_ids``.

    Args:
        soup_full: path of the filtered soup table.
        soup_cc: path of the table with ``v`` / ``indevice_id`` labels.
        collapsed_uuids: path of the uuid mapping table, or ``None`` to skip it.
        bad_ids: iterable of table paths (strings) with bad ids.
        bad_edges: iterable of table paths (strings) with bad edges.
        output_table: path of the resulting id -> indevice table.
        output_table_comp_sizes: path of the component sizes table.
        output_bad_ids: path of the merged bad ids table.
        output_bad_edges: path of the merged bad edges table.
        tx: object whose ``transaction_id`` is passed to ``yt.ExternalTx``.
    """
    if collapsed_uuids is not None:
        collapsed_uuids_table = collapsed_uuids
        out_no_sizes = """
            select
                if(b.id1 is null, a.id, b.id1) as id,
                if(b.id1 is null, a.id_type, b.id1Type) as id_type,
                a.indevice_id as indevice_id,
                if(b.id1 is null, a.date_begin, b.date_begin) as date_begin,
                if(b.id1 is null, a.date_end, b.date_end) as date_end
            from $ids_cc as a
            left join $collapsed_uuids as b on a.id = b.id2 and a.id_type = b.id2Type
        """
    else:
        # Placeholder value: $collapsed_uuids is declared but never read in this branch.
        collapsed_uuids_table = "undefined"
        out_no_sizes = """
            select id, id_type, indevice_id, date_begin, date_end
            from $ids_cc
        """

    query_template = """
        PRAGMA SimpleColumns;
        PRAGMA yt.ExternalTx = '{tx}';
        PRAGMA yt.InferSchema;
        PRAGMA yt.DisableJobSplitting;

        $collapsed_uuids = '{collapsed_uuids}';

        $soup_full = '{soup_full}';

        -- fetch ids from soup
        $soup_dup = (
            SELECT id1Type as id_type, id1 as id, date_begin, date_end
            FROM $soup_full
            UNION ALL
            SELECT id2Type as id_type, id2 as id, date_begin, date_end
            FROM $soup_full
        );

        $soup_ids = (
            select id_type, id,
                   (id_type || '_' || id) as v,
                   (id_type || '_' || id) as fallback_cc,
                   min(date_begin) as date_begin,
                   max(date_end) as date_end
            from $soup_dup
            group by id_type, id
        );

        -- join indevice_id from cc
        $ids_cc = (
            SELECT
                a.id_type AS id_type,
                a.id AS id,
                nvl(
                    cast(b.indevice_id as string),
                    a.fallback_cc
                ) AS indevice_id,
                a.date_begin AS date_begin,
                a.date_end AS date_end
            FROM ANY $soup_ids AS a
            LEFT JOIN ANY `{soup_cc}` AS b
            ON a.v = b.v
        );

        $out_no_sizes = (
            {out_no_sizes}
        );

        -- calculate cluster sizes and join back to ids
        $indevice_sizes = (
            SELECT
                indevice_id,
                count(*) AS size,
                min(date_begin) as cluster_date_begin,
                max(date_end) as cluster_date_end,
                Digest::Md5Hex(min_by(id_type || '_' || id, (date_begin ?? '1970-01-01') || id)) as stable_indevice_id
            FROM $out_no_sizes
            GROUP BY indevice_id
        );

        $out_with_sizes = (
            SELECT
                id,
                id_type,
                some(a.date_begin) as date_begin,
                some(a.date_end) as date_end,
                some(b.stable_indevice_id) as indevice_id,
                some(b.cluster_date_begin) as cluster_date_begin,
                some(b.cluster_date_end) as cluster_date_end,
                some(b.size) AS size,
                Ensure(null, count(*) == 1, 'each id must belong to a single indevice_id') as check_uniq
            FROM $out_no_sizes as a
            INNER JOIN ANY $indevice_sizes as b
            using (indevice_id)
            group compact by
                a.id_type as id_type,
                a.id as id
        );

        insert into `{output_table_comp_sizes}` with truncate
        select
            stable_indevice_id as indevice_id,
            size
        from $indevice_sizes
        order by size desc;

        -- output
        insert into `{output_table}` with truncate
        select
            id,
            id_type,
            date_begin,
            date_end,
            cluster_date_begin,
            cluster_date_end,
            indevice_id,
            size
        from $out_with_sizes
        order by id_type, id;

        discard select check_uniq from $out_with_sizes;

        -- find and concat bad edges and bad ids
        $bad_edges = (
            select * from EACH(AsList('{in_bad_edges}'))
        );

        insert into `{out_bad_edges}` with truncate
        select * from $bad_edges;

        $orphaned_ids = (
            select
                id as id, id_type, 'orphaned' as reason
            from $out_with_sizes
            where size = 1
        );

        insert into `{out_bad_ids}` with truncate
        select id, id_type, reason from EACH(AsList('{in_bad_ids}'))
        union all
        select id, id_type, reason from $orphaned_ids;


    """

    # Table lists are spliced into AsList('a','b',...) by joining on "','".
    substitutions = dict(
        tx=tx.transaction_id,
        soup_full=soup_full,
        soup_cc=soup_cc,
        collapsed_uuids=collapsed_uuids_table,
        out_no_sizes=out_no_sizes,
        in_bad_ids="','".join(bad_ids),
        in_bad_edges="','".join(bad_edges),
        output_table=output_table,
        output_table_comp_sizes=output_table_comp_sizes,
        out_bad_edges=output_bad_edges,
        out_bad_ids=output_bad_ids,
    )

    run_yql(query_template.format(**substitutions))


@timed
def build_indevice(
    full_soup_tables,
    idstorage_root,
    output_table,
    output_table_comp_sizes,
    output_bad_edges,
    output_bad_ids,
    workdir=None,
    collapse_uuids=False,
):
    """Run the indevice Luigi pipeline with the given inputs and outputs.

    Every argument that is not ``None`` and whose name matches an attribute of
    a fresh ``BuildIndeviceParams`` instance is copied onto that instance; the
    instance is then handed to the ``FinalizeIndevice`` root task, which is
    built with three workers.

    Args:
        full_soup_tables: copied to ``params.full_soup_tables``.
        idstorage_root: copied to ``params.idstorage_root``.
        output_table: copied to ``params.output_table``.
        output_table_comp_sizes: copied to ``params.output_table_comp_sizes``.
        output_bad_edges: copied to ``params.output_bad_edges``.
        output_bad_ids: copied to ``params.output_bad_ids``.
        workdir: working directory; defaults to ``"//tmp"`` when ``None``.
        collapse_uuids: copied to ``params.collapse_uuids``.

    Raises:
        Exception: if the root task is not complete after the Luigi run.
    """
    if workdir is None:
        workdir = "//tmp"

    # Explicit mapping instead of locals(): locals() also exposes this
    # function's own temporaries (the params object and the loop variables),
    # so a same-named attribute on BuildIndeviceParams would be overwritten.
    arguments = {
        "full_soup_tables": full_soup_tables,
        "idstorage_root": idstorage_root,
        "output_table": output_table,
        "output_table_comp_sizes": output_table_comp_sizes,
        "output_bad_edges": output_bad_edges,
        "output_bad_ids": output_bad_ids,
        "workdir": workdir,
        "collapse_uuids": collapse_uuids,
    }

    params = BuildIndeviceParams()
    # Iterate over dir(params) so only attributes the params object already
    # has are assigned, in the same (sorted) order as before.
    for name in dir(params):
        value = arguments.get(name)
        if value is not None:
            setattr(params, name, value)

    root_task = FinalizeIndevice(params)

    # Use the central scheduler when one is configured, else a local one.
    if config.LUIGID_URL:
        scheduler_options = {"scheduler_url": config.LUIGID_URL}
    else:
        scheduler_options = {"local_scheduler": True}
    luigi.build([root_task], workers=3, log_level="INFO", **scheduler_options)

    if not root_task.complete():
        raise Exception("Luigi failed")


@timed
def make_sorted_tables(tbl, tbl_by_id, tbl_by_indevice):
    """Sort ``tbl`` into two destination tables.

    ``tbl_by_id`` receives ``tbl`` sorted by ``id_type, id``; then
    ``tbl_by_indevice`` receives ``tbl`` sorted by ``indevice_id``.
    """
    sort_plan = (
        (tbl_by_id, ["id_type", "id"]),
        (tbl_by_indevice, ["indevice_id"]),
    )
    for destination, sort_columns in sort_plan:
        yt.run_sort(tbl, destination, sort_by=sort_columns)


@timed
def set_output_date_attr(output_date, *tables):
    """Set the ``generate_date`` attribute to ``output_date`` on each of ``tables``."""
    attribute_template = "{path}/@generate_date"
    for table_path in tables:
        attribute_path = attribute_template.format(path=table_path)
        yt.set(attribute_path, output_date)
