#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import copy
import argparse
from collections import defaultdict
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    files as nfi,
    Record,
    with_hints,
    extended_schema,
)
from qb2.api.v1 import (
    filters as qf,
    extractors as qe,
    typing as qt,
    resources as qr,
    typing as qt,
)
import datetime
import requests

# import json
# import hashlib
# import urlparse
# import re
from videolog_common import (
    date_range,
    optionalize_schema,
    get_date,
    get_cluster,
    get_driver,
    get_stat_headers,
    StatPusher,
)
from strm_cube_2_common import (
    preprocessed_schema,
    StrmParser,
    JsTracerParser,
    RedirParser,
    DspParser,
    RtbParser,
    microsessions_reducer,
    add_session_markers,
    microsessions_schema,
    get_hash,
)

# Cluster handle shared through a module global: it stays None until main()
# assigns the result of get_cluster(); dsp_pipeline() and process_date() read it.
cluster = None
# Content metadata table; process_date() projects it and left-joins it to the
# sessions stream by video_content_id (taken from the "JoinKey" column).
IRON_BRANCH_TABLE = "//home/videolog/strm_meta/iron_branch/concat"
# Page/impression table; read eagerly to build page-id lookups for DspParser
# and also joined inside rtb_dsp_pipeline() by (page_id, imp_id).
PAGE_IMP_TABLE = "//home/videolog/strm_meta/page_imp"
# Table whose "DSPID" column is collected into a set for DspParser.
DSP_ID_TABLE = "//home/videolog/strm_meta/dsp_id"
# Table read into an {ASN: ISP} dict that is passed to the parsers.
ASNAME_TABLE = "//home/videolog/AS_MAP/proper_AS_names_corrected"
# Crypta profile export; the source of the socdem columns joined by yu_hash.
CRYPTA_TABLE = "//home/crypta/production/profiles/export/profiles_for_14days"
# Per-user visit history consumed by EfirHistoryReducer.
EFIR_HISTORY_TABLE = "//home/videolog/24julia/mma-2177/1.efir_history"
# Per-user subscription table; the source of the user_license column.
USER_LICENSES_TABLE = "//home/videolog/strm_meta/yandexuids_subscriptions"
# Stat report path used both to query and to push the sessions build time.
SLA_REPORT = "Video/Others/Strm/strm_cube_2_SLA"
# Local modules attached to operations through their files= argument.
nfi_common_small = [
    nfi.LocalFile("strm_cube_2_common.py"),
    nfi.LocalFile("videolog_common.py"),
]
# Same as above plus the Statbox dictionaries attached to the parsing mappers.
nfi_common = nfi_common_small + [
    nfi.StatboxDict("Geobasev6.bin"),
    nfi.StatboxDict("IpOriginV6.xml"),
]
# Default for --job_root; process_date() skips the SLA check and push when the
# actual job root differs from this value.
default_jobroot = "//home/videoquality/vh_analytics/strm_cube_2"


def common_pipeline(
    cluster,
    job,
    table,
    parser,
    asname_dict,
    intensity="large_data",
    memory_limit=5000,
    optional_kwargs=None,
):
    """Map one raw log table into the preprocessed schema.

    ``parser`` is a parser class (or factory) instantiated as
    ``parser(asname_dict, **optional_kwargs)``; the resulting callable is used
    as the mapper. ``cluster`` is accepted but not used here.

    Returns the mapped stream.
    """
    parser_kwargs = optional_kwargs if optional_kwargs else {}
    source = job.table(table)
    hinted = with_hints(output_schema=preprocessed_schema)
    mapper = hinted(parser(asname_dict, **parser_kwargs))
    return source.map(
        mapper,
        files=nfi_common,
        memory_limit=memory_limit,
        intensity=intensity,
    )


def dsp_pipeline(job, dsp_table):
    """Parse a DSP log table with DspParser and keep a fixed set of columns.

    The set of known page ids is read eagerly from PAGE_IMP_TABLE through the
    module-level ``cluster``, so ``cluster`` must already be initialised.
    """
    page_ids = set()
    for row in cluster.driver.yt_driver.read(PAGE_IMP_TABLE):
        page_ids.add(str(row["PageID"]))

    kept_columns = (
        "os_family",
        "browser_name",
        "timestamp",
        "yu_hash",
        "yandexuid",
        "page_id",
        "event",
        "source",
        "add_info",
    )

    source = job.table(dsp_table)
    mapper = with_hints(output_schema=preprocessed_schema)(
        DspParser(page_ids)
    )
    parsed = source.map(
        mapper, files=nfi_common_small, intensity="large_data"
    )
    return parsed.project(*kept_columns)


def merge_dicts(*args):
    """Merge any number of dicts left to right into a new dict.

    Later arguments override earlier ones on key collisions. Falsy arguments
    (``None``, ``{}``) are skipped. With no arguments an empty dict is
    returned.

    The result is always a fresh dict: the previous implementation updated
    and returned its first argument in place, silently modifying the caller's
    data.
    """
    merged = {}
    for d in args:
        merged.update(d or {})
    return merged


def try_get_content_type_id(chain):
    """Return ``int(chain[-1]["ContentTypeID"])`` or -1 when unavailable.

    -1 is returned when ``chain`` is ``None`` or empty, when its last element
    is not subscriptable, when the key is missing, or when the value cannot
    be converted to ``int``.
    """
    try:
        return int(chain[-1]["ContentTypeID"])
    except (ValueError, TypeError, AttributeError, IndexError, KeyError):
        # KeyError: last element is a dict without "ContentTypeID".
        return -1


def check_rtb_dsp(a, c):
    """Decide whether a joined DSP/page-imp row is kept.

    ``a`` must provide "DspID" and "producttype"; ``c`` must provide
    "video_type". Rows whose video_type is not "in_app" always pass; "in_app"
    rows pass only when DspID is not 1 or producttype is "auto-video-direct".
    """
    # All three keys are read up front, so a missing key raises regardless of
    # which branch would have been taken.
    dsp_id = a["DspID"]
    product_type = a["producttype"]
    video_type = c["video_type"]

    if video_type != "in_app":
        return True
    return dsp_id != 1 or product_type == "auto-video-direct"


def rtb_dsp_pipeline(cluster, job, dsp_table, asname_dict, rtb_table=None):
    """Build the RTB/DSP stream enriched with page-impression metadata.

    PAGE_IMP_TABLE and DSP_ID_TABLE are read eagerly through
    ``cluster.driver.yt_driver`` to build the lookups handed to DspParser.

    Two modes:
      * ``rtb_table`` given: ``dsp_table`` and ``rtb_table`` are parsed
        separately (DspParser / RtbParser) and left-joined on "bidreqid".
      * ``rtb_table`` falsy: ``dsp_table`` is parsed once by DspParser with
        ``process_rtb=True``.

    In both modes the result is inner-joined with the page-imp projection on
    (page_id, imp_id), filtered with check_rtb_dsp, and the three partial
    ``add_info_*`` columns are merged into a single ``add_info`` column.

    Returns the resulting stream.
    """
    # PageID (as str) -> partner name. not_vh_partner is used when it equals
    # "yandex.ru", otherwise vh_partner. Rows whose vh_partner is "not_vh" are
    # skipped unless their not_vh_partner is "yandex.ru".
    page_ids = {
        str(x["PageID"]): (
            x.get("not_vh_partner")
            if x.get("not_vh_partner") == "yandex.ru"
            else x.get("vh_partner")
        )
        for x in cluster.driver.yt_driver.read(PAGE_IMP_TABLE)
        if (
            x.get("vh_partner", "") != "not_vh"
            or x.get("not_vh_partner") == "yandex.ru"
        )
    }

    dsp_ids = {x["DSPID"] for x in cluster.driver.yt_driver.read(DSP_ID_TABLE)}

    # Columns kept from the DSP side and from the RTB side respectively.
    dsp_fields_common = [
        "os_family",
        "browser_name",
        "timestamp",
        "yu_hash",
        "yandexuid",
        "page_id",
        "event",
        "source",
        "imp_id",
    ]
    rtb_fields_common = [
        "vsid",
        "video_content_id",
        "user_agent",
        "browser_version",
        "device_type",
        "region",
        "country",
        "ip",
        "a_station",
        "provider",
    ]

    # DSP output schema: the preprocessed schema with "add_info" replaced by
    # "add_info_a", plus "imp_id". Deep-copied so the shared schema object is
    # not modified.
    dsp_schema = copy.deepcopy(preprocessed_schema)
    dsp_schema.pop("add_info")
    dsp_schema["add_info_a"] = qt.Optional[qt.Json]
    dsp_schema["imp_id"] = qt.Optional[qt.String]

    if rtb_table:

        dsp_fields = ["bidreqid", "add_info_a"] + dsp_fields_common

        dsp = (
            job.table(dsp_table)
            .map(
                with_hints(output_schema=dsp_schema)(
                    DspParser(page_ids, dsp_ids, add_info="add_info_a")
                ),
                files=nfi_common_small,
                intensity="large_data",
            )
            .project(dsp_fields)
        )

        # RTB output schema: "add_info" is replaced by "add_info_b".
        rtb_schema = copy.deepcopy(preprocessed_schema)
        rtb_schema.pop("add_info")
        rtb_schema["add_info_b"] = qt.Optional[qt.Json]

        rtb_fields = ["bidreqid", "add_info_b"] + rtb_fields_common

        rtb = (
            job.table(rtb_table)
            .map(
                with_hints(output_schema=rtb_schema)(RtbParser(asname_dict)),
                files=nfi_common,
                intensity="large_data",
            )
            .unique("bidreqid")
            .project(rtb_fields)
        )

        # Left join: DSP rows without a matching RTB row are kept.
        rtb_dsp = dsp.join(rtb, type="left", by="bidreqid")
    else:
        # Single-table mode: the mapper output carries both add_info_a and
        # add_info_b, so the schema is extended accordingly.
        dsp_schema["add_info_b"] = qt.Optional[qt.Json]
        rtb_dsp_fields = (
            ["bidreqid", "add_info_a", "add_info_b"]
            + dsp_fields_common
            + rtb_fields_common
        )

        rtb_dsp = (
            job.table(dsp_table)
            .map(
                with_hints(output_schema=dsp_schema)(
                    DspParser(
                        page_ids,
                        dsp_ids,
                        add_info="add_info_a",
                        process_rtb=True,
                        asname_dict=asname_dict,
                    )
                ),
                files=nfi_common,
                intensity="large_data",
            )
            .project(rtb_dsp_fields)
        )

    # Page-imp projection: join keys are stringified, and the ad/video type
    # and partner columns are packed into the add_info_c dict.
    page_imp = job.table(PAGE_IMP_TABLE).project(
        category_id=ne.custom(str, "category_id").add_hints(
            type=qt.Optional[qt.String]
        ),
        imp_id=ne.custom(str, "ImpID").add_hints(type=qt.Optional[qt.String]),
        page_id=ne.custom(str, "PageID").add_hints(
            type=qt.Optional[qt.String]
        ),
        add_info_c=ne.custom(
            lambda ad_type, video_type, not_vh_partner, vh_partner: {
                "ad_type": ad_type,
                "video_type": video_type,
                "not_vh_partner": not_vh_partner,
                "vh_partner": vh_partner,
            },
            "type",
            "video_type",
            "not_vh_partner",
            "vh_partner",
        ).add_hints(type=qt.Optional[qt.Json]),
    )

    return (
        rtb_dsp.join(page_imp, type="inner", by=["page_id", "imp_id"])
        .filter(nf.custom(check_rtb_dsp, "add_info_a", "add_info_c"))
        .project(
            ne.all(
                exclude=[
                    "add_info_a",
                    "add_info_b",
                    "add_info_c",
                    "video_content_id",
                ]
            ),
            # A falsy video_content_id is replaced by the "novcid" marker.
            video_content_id=ne.custom(
                lambda x: x if x else "novcid", "video_content_id"
            ).add_hints(type=qt.Optional[qt.String]),
            add_info=ne.custom(
                merge_dicts, "add_info_a", "add_info_b", "add_info_c"
            ).add_hints(type=qt.Optional[qt.Json]),
        )
    )


def get_expboxes(headers):
    """Extract the "X-Yandex-ExpBoxes" value from a "~"-joined header string.

    Each "~"-separated segment must look like ``"Name: value"``. Returns
    ``None`` when ``headers`` has no ``split`` method, when any segment lacks
    the ``": "`` separator, or when the header is absent. For repeated header
    names the last occurrence wins.
    """
    try:
        segments = headers.split("~")
    except AttributeError:
        return None

    parsed = {}
    for segment in segments:
        pieces = segment.split(": ", 1)
        if len(pieces) != 2:
            # One malformed segment invalidates the whole header string.
            return None
        name, value = pieces
        parsed[name] = value
    return parsed.get("X-Yandex-ExpBoxes")


def get_expboxes_news(httpHeaders):
    """Extract the "x-yandex-expboxes" value from a list of header dicts.

    ``httpHeaders`` is expected to be an iterable of mappings with "name" and
    "value" keys; ``None`` or another falsy value is treated as an empty
    list. Returns ``None`` when the structure is malformed or the header is
    absent.
    """
    httpHeaders = httpHeaders or []
    try:
        dict_ = {x["name"]: x["value"] for x in httpHeaders}
    except (KeyError, AttributeError, ValueError, TypeError):
        # TypeError covers a non-iterable value, elements that are not
        # subscriptable by string (e.g. plain strings) and unhashable names;
        # previously these escaped and failed the whole operation.
        return
    return dict_.get("x-yandex-expboxes")


@with_hints(output_schema=extended_schema())
def test_buckets_reduce(groups):
    """Collapse all test-bucket strings of one key into a single string.

    Each record may carry a ";"-separated ``test_buckets`` string whose items
    start with a test id followed by ",". For every test id exactly one
    bucket is kept, and the kept buckets are emitted sorted and joined by ";".

    When a test id was seen with several different buckets, the
    lexicographically smallest one is kept. The previous implementation used
    ``set.pop()``, which returns an arbitrary element and made the output
    vary between runs.
    """
    for key, recs in groups:
        buckets_by_testid = defaultdict(set)
        for rec in recs:
            tb = rec.get("test_buckets")
            if not tb:
                continue
            for bucket in tb.split(";"):
                testid = bucket.split(",")[0]
                buckets_by_testid[testid].add(bucket)
        # Each set has at least one element, so min() cannot fail here.
        result = {min(buckets) for buckets in buckets_by_testid.values()}
        yield Record(key, test_buckets=";".join(sorted(result)))


def test_buckets_pipeline(
    job, access_log_table, morda_access_log_table, news_access_log_table
):
    """Collect per-yandexuid test buckets from three access-log tables.

    Every table is parsed with qb2 as a "generic-log" into
    (yandexuid, test_buckets) rows; rows without a yandexuid, with yandexuid
    "-", or without test_buckets are filtered out. The three streams are
    concatenated, grouped by yandexuid and reduced with test_buckets_reduce.
    """

    def usable_rows():
        # A fresh list of filter objects for every qb2() call.
        return [
            qf.defined("yandexuid"),
            qf.not_(qf.equals("yandexuid", "-")),
            qf.defined("test_buckets"),
        ]

    # Buckets are parsed out of the raw "headers" string.
    raw_access = job.table(access_log_table)
    access_log = raw_access.qb2(
        log="generic-log",
        fields=[
            qe.log_field("headers").hide().with_type(str),
            qe.log_field("raw_yandexuid").rename("yandexuid").with_type(str),
            qe.custom("test_buckets", get_expboxes, "headers").with_type(
                qt.Optional[qt.String]
            ),
        ],
        filters=usable_rows(),
    )

    # Buckets come ready-made in the "test-bucket" log field.
    raw_morda = job.table(morda_access_log_table)
    morda_access_log = raw_morda.qb2(
        log="generic-log",
        fields=[
            qe.log_field("raw_yandexuid").rename("yandexuid").with_type(str),
            qe.log_field("test-bucket").rename("test_buckets").with_type(str),
        ],
        filters=usable_rows(),
    )

    # Buckets are parsed out of the structured "httpHeaders" field.
    raw_news = job.table(news_access_log_table)
    news_access_log = raw_news.qb2(
        log="generic-log",
        fields=[
            qe.log_field("vhost").hide().with_type(str),
            qe.log_field("yandexUid").rename("yandexuid").with_type(str),
            qe.log_field("httpHeaders").hide().with_type(qt.Optional[qt.Json]),
            qe.custom(
                "test_buckets", get_expboxes_news, "httpHeaders"
            ).with_type(str),
        ],
        filters=usable_rows(),
    )

    combined = job.concat(access_log, morda_access_log, news_access_log)
    return combined.groupby("yandexuid").reduce(test_buckets_reduce)


def try_get_from_parent(chain, field, t="str"):
    """Return ``field`` of the parent element (``chain[-2]``) or a default.

    The default is ``""`` when ``t == "str"`` and ``0`` otherwise. It is
    returned when ``chain`` is ``None`` or otherwise not indexable, has fewer
    than two elements, or the parent element does not provide ``field``.

    The previous implementation looped over ``chain[-2::-1]`` but returned on
    the first iteration in every case, so it only ever inspected
    ``chain[-2]``; it also raised ``TypeError`` for a ``None`` chain. This
    version keeps the parent-only lookup and makes it explicit and safe.
    """
    default = "" if t == "str" else 0
    try:
        return chain[-2][field]
    except (IndexError, TypeError, KeyError):
        return default


def _none_first_le(a, b):
    """Return ``a <= b`` with ``None`` ordered before every other value.

    Spells out the ordering Python 2 applies implicitly when comparing
    ``None`` with numbers, so the result does not depend on the interpreter
    version (Python 3 raises ``TypeError`` for such comparisons).
    """
    if a is None:
        return True
    if b is None:
        return False
    return a <= b


def get_view_type(timetuple, timestamp, content_type_id, heur_category):
    """Classify a view as "vod", "live" or "dvr".

    * "vod"  - ``heur_category`` is "vod".
    * "live" - ``timestamp`` lies within ``[timetuple[0], timetuple[1]]``
      (a missing start is unbounded, a missing end never matches), or the
      start is falsy and ``content_type_id`` is 2.
    * "dvr"  - everything else.

    ``timetuple`` may be ``None``; it is then treated as ``[None, None]``.
    """
    if heur_category == "vod":
        return "vod"
    timetuple = timetuple or [None, None]
    start, finish = timetuple[0], timetuple[1]
    if _none_first_le(start, timestamp) and _none_first_le(timestamp, finish):
        return "live"
    if not start and content_type_id == 2:
        return "live"
    return "dvr"


class EfirHistoryReducer(object):
    """Reducer that summarises a user's visit history.

    For every group it looks at the keys of each record's ``info`` mapping,
    converts them with get_date(), and emits one record with:
      * ``yu_hash`` - the group's ``yandexuid`` key;
      * ``efir_first_visit`` - the earliest date as a string, or ``None``;
      * ``efir_last_month_active_days`` - number of distinct dates on or
        after the threshold (30 days before the date given to __init__).
    """

    def __init__(self, date):
        reference = get_date(date)
        self.threshold = reference - datetime.timedelta(days=30)

    def __call__(self, groups):
        for key, recs in groups:
            earliest = None
            recent_days = set()
            for rec in recs:
                for raw_date in rec.info.keys():
                    visit = get_date(raw_date)
                    if not earliest or visit < earliest:
                        earliest = visit
                    if visit >= self.threshold:
                        recent_days.add(visit)

            if earliest:
                first_visit_repr = str(earliest)
            else:
                first_visit_repr = None

            yield Record(
                yu_hash=key.yandexuid,
                efir_first_visit=first_visit_repr,
                efir_last_month_active_days=len(recent_days),
            )


def good_vcid(s):
    """Return True when ``s`` is a real content id.

    Falsy values and the "novcid" placeholder are rejected.
    """
    if not s:
        return False
    return s != "novcid"


class GetFirstVisit(object):
    """Callable that fills in a missing first-visit date.

    Called with ``(efir_first_visit, ref_from)``: an existing truthy
    ``efir_first_visit`` is returned unchanged; otherwise the date given to
    the constructor is returned when ``ref_from`` is one of EFIR_REFFROMS,
    and ``None`` in every other case.
    """

    # ref_from values for which the current date counts as a first visit.
    EFIR_REFFROMS = {
        "videohub",
        "efir",
        "streamhandler_other",
        "morda",
        "videohub_touch",
        "efir_touch",
        "streamhandler_appsearch",
        "morda_touch",
    }

    def __init__(self, date_f):
        self.date_f = date_f

    def __call__(self, efir_first_visit, ref_from):
        if efir_first_visit:
            return efir_first_visit
        if ref_from in self.EFIR_REFFROMS:
            return self.date_f
        return None


def select_subscription(licenses_dates):
    """Return ``user_subscription`` of the entry with the latest date.

    Each entry must expose ``dates`` and ``user_subscription`` attributes.
    Entries are ranked by ``max(entry.dates)``; on ties the entry that comes
    last in the input wins (stable sort, last element taken). An empty input
    raises ``IndexError``.
    """

    def latest_date(entry):
        return max(entry.dates)

    ranked = sorted(licenses_dates, key=latest_date)
    winner = ranked[-1]
    return winner.user_subscription


def process_date(date, args):
    """Build the "preprocessed" and "sessions" tables for one day.

    Steps:
      1. Unless a usable ``<job_root>/<date>/preprocessed`` table already
         exists, parse the day's strm, js-tracer, redir and rtb-dsp-cooked
         logs, concatenate them, sort by (vsid, timestamp) and store the
         result. A missing source table raises ``Exception``.
      2. Stamp the preprocessed table with a ``build_time`` attribute.
      3. Unless ``args.no_sessions`` is set, reduce the preprocessed rows into
         microsessions, enrich them (content metadata, crypta, visit history,
         test buckets, user licenses), mark sessions and store the result in
         ``<job_root>/<date>/sessions``.
      4. When the job root is the default one, push the sessions build time
         to the SLA report unless that date is already present there.

    Args:
        date: the day to process; must support ``strftime`` and comparison
            with ``datetime.date``.
        args: parsed command-line namespace; ``job_root``, ``redo`` and
            ``no_sessions`` are read here.

    Uses the module-level ``cluster``, which must already be initialised.
    """
    date_f = format(date)
    date_rev = date.strftime("%d-%m-%Y")
    strm_table = "//logs/strm-access-log/1d/{}".format(date)
    redir_table = "//logs/redir-log/1d/{}".format(date)
    js_tracer_table = "//logs/jstracer-log/1d/{}".format(date)
    js_tracer_table_old = "//home/js_tracer/day_by_day/{}".format(date_rev)
    # rtb_table = '//logs/bs-rtb-log/1d/{}'.format(date)
    # dsp_table = '//logs/bs-dsp-log/1d/{}'.format(date)
    rtb_dsp_cooked_table = "//statbox/cooked_logs/bs-dsp-cooked-log/v1/1d/{}".format(
        date
    )
    access_log_table = "//logs/yandex-access-log/1d/{}".format(date)
    morda_access_log_table = "//logs/morda-access-log/1d/{}".format(date)
    news_access_log_table = "//logs/news-scarab-access-log/1d/{}".format(date)
    driver = get_driver(cluster)
    client = driver.client

    root = "{}/{}".format(args.job_root, date_f)
    preprocessed = "{}/preprocessed".format(root)
    sessions = "{}/sessions".format(root)

    asname_dict = {rec.ASN: rec.ISP for rec in driver.read(ASNAME_TABLE)}

    # Best effort: --redo may be absent or may not be a date, in which case
    # redo_date stays None. Only ordinary errors are swallowed, so Ctrl-C and
    # SystemExit still propagate (the clause used to be a bare "except:").
    redo_date = None
    try:
        redo_date = get_date(args.redo)
    except Exception:
        pass

    # Best effort as well: the table may not exist yet.
    try:
        mtimedate = get_date(
            driver.get_attribute(preprocessed, "modification_time")
        )
    except Exception:
        mtimedate = None

    job = cluster.job()

    # Reuse the stored table when no redo was requested, or when the redo
    # date is not later than the table's modification date.
    if driver.exists(preprocessed) and (
        not args.redo or redo_date and mtimedate and redo_date <= mtimedate
    ):
        print("using existing preprocessed table")
        stream = job.table(preprocessed)
    else:
        to_concat = []
        if driver.exists(strm_table):
            to_concat.append(
                common_pipeline(
                    cluster,
                    job,
                    strm_table,
                    StrmParser,
                    asname_dict,
                    memory_limit=5000,
                    intensity="data",
                )
            )
        else:
            raise Exception("no strm table")
        # Prefer the new js-tracer location; fall back to the old layout.
        if driver.exists(js_tracer_table):
            to_concat.append(
                common_pipeline(
                    cluster,
                    job,
                    js_tracer_table,
                    JsTracerParser,
                    asname_dict,
                    optional_kwargs={"new": True},
                )
            )
        elif driver.exists(js_tracer_table_old):
            to_concat.append(
                common_pipeline(
                    cluster,
                    job,
                    js_tracer_table_old,
                    JsTracerParser,
                    asname_dict,
                )
            )
        else:
            raise Exception("no js-tracer table")
        # NOTE(review): for dates before 2019-03-25 this raises "no redir
        # table" even when the table exists, and --no_redir is never
        # consulted - confirm whether that is intended.
        if driver.exists(redir_table) and date >= datetime.date(2019, 3, 25):
            to_concat.append(
                common_pipeline(
                    cluster, job, redir_table, RedirParser, asname_dict
                )
            )
        else:
            raise Exception("no redir table")
        if driver.exists(rtb_dsp_cooked_table):
            print("using rtb-dsp cooked log")
            to_concat.append(
                rtb_dsp_pipeline(
                    cluster,
                    job,
                    rtb_dsp_cooked_table,
                    asname_dict,
                    rtb_table=None,
                )
            )
        else:
            raise Exception("no rtb-dsp-cooked table")

        stream = (
            job.concat(*to_concat).sort("vsid", "timestamp").put(preprocessed)
        )

    job.run()

    # The attribute is written on every run, including runs that reused an
    # existing table.
    client.set_attribute(
        preprocessed,
        "build_time",
        datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
    )

    if args.no_sessions:
        return

    for alt in [
        access_log_table,
        morda_access_log_table,
        news_access_log_table,
    ]:
        if not driver.exists(alt):
            raise Exception("no access-log table ({})".format(alt))

    # Second job: sessions.
    job = cluster.job()
    # NOTE(review): ContentTypeID and ParentTypeID are computed by identical
    # expressions (the parent's ContentTypeID) - confirm this is intended.
    iron_branch = job.table(IRON_BRANCH_TABLE).project(
        "heur_category",
        "license",
        content_duration="duration",
        content_type_id=ne.custom(try_get_content_type_id, "chain").add_hints(
            type=qt.Integer
        ),
        video_content_id="JoinKey",
        channel="computed_channel",
        program="computed_program",
        timetuple=ne.custom(
            lambda x, y: (x, y), "start_time", "finish_time"
        ).add_hints(type=qt.Json),
        UUID="UUID",
        ContentTypeID=ne.custom(
            lambda x: try_get_from_parent(x, "ContentTypeID", t="int"), "chain"
        ).add_hints(type=int),
        ParentUUID=ne.custom(
            lambda x: try_get_from_parent(x, "UUID"), "chain"
        ).add_hints(type=str),
        ParentTypeID=ne.custom(
            lambda x: try_get_from_parent(x, "ContentTypeID", t="int"), "chain"
        ).add_hints(type=int),
    )

    test_buckets = test_buckets_pipeline(
        job, access_log_table, morda_access_log_table, news_access_log_table
    )

    crypta = (
        job.table(CRYPTA_TABLE)
        .project(
            "age_segments",
            "user_age_6s",
            "gender",
            "income_segments",
            "exact_socdem",
            yandexuid_from_crypta=ne.custom(str, "yandexuid").add_hints(
                type=str
            ),
            yu_hash=ne.custom(
                lambda x: get_hash(str(x)), "yandexuid"
            ).add_hints(type=str),
            files=nfi_common_small,
        )
        .unique("yu_hash")
    )

    # Rows without a vsid are dropped before the per-vsid reduce.
    stream = (
        job.table(preprocessed)
        .filter(nf.custom(bool, "vsid"))
        .groupby("vsid")
        .sort("timestamp", "video_content_id")
        .reduce(
            with_hints(
                output_schema=optionalize_schema(qt, microsessions_schema)
            )(microsessions_reducer),
            files=nfi_common_small,
            memory_limit=10000,
            intensity="data",
        )
    )

    # Only rows with a real content id are joined with content metadata; the
    # rest bypass the join and are concatenated back afterwards.
    stream_with_vcid = stream.filter(nf.custom(good_vcid, "video_content_id"))

    stream_without_vcid = stream.filter(
        nf.custom(lambda x: not good_vcid(x), "video_content_id")
    )

    stream_with_vcid = stream_with_vcid.join(
        iron_branch,
        by="video_content_id",
        type="left",
        assume_unique_right=True,
    )

    stream = job.concat(stream_with_vcid, stream_without_vcid)

    efir_history = (
        job.table(EFIR_HISTORY_TABLE)
        .groupby("yandexuid")
        .reduce(
            with_hints(
                output_schema=dict(
                    yu_hash=qt.Optional[qt.String],
                    efir_first_visit=qt.Optional[qt.String],
                    efir_last_month_active_days=int,
                )
            )(EfirHistoryReducer(date_f)),
            files=nfi_common_small,
        )
    )

    # Same split-join-concat pattern for the per-user joins keyed by yu_hash.
    yu_hash_filter = qf.and_(
        qf.nonzero("yu_hash"), qf.not_(qf.equals("yu_hash", "-"))
    )

    stream_with_yu_hash = stream.filter(yu_hash_filter)

    stream_without_yu_hash = stream.filter(qf.not_(yu_hash_filter))

    stream_with_yu_hash = stream_with_yu_hash.project(
        ne.all(exclude=["video_content_id"]),
        video_content_id=ne.custom(
            lambda x: x if x else "novcid", "video_content_id"
        ).add_hints(type=qt.Optional[qt.String]),
    )

    stream_with_yu_hash = (
        stream_with_yu_hash.join(
            crypta, by="yu_hash", type="left", assume_unique_right=True
        )
        .join(
            efir_history, type="left", by="yu_hash", assume_unique_right=True
        )
        .project(
            ne.all(
                exclude=["efir_first_visit", "efir_last_month_active_days"]
            ),
            efir_first_visit=ne.custom(
                GetFirstVisit(date_f), "efir_first_visit", "ref_from"
            ).with_type(qt.Optional[qt.String]),
            efir_last_month_active_days=ne.custom(
                lambda x: x if x else 0, "efir_last_month_active_days"
            ).with_type(int),
        )
    )

    stream = job.concat(stream_with_yu_hash, stream_without_yu_hash)

    # yandexuid falls back to the value obtained from crypta when empty.
    stream = stream.project(
        ne.all(exclude=["yandexuid", "yandexuid_from_crypta"]),
        yandexuid=ne.custom(
            lambda x, y: (x if x else y), "yandexuid", "yandexuid_from_crypta"
        ).add_hints(type=str),
        view_type=ne.custom(
            get_view_type,
            "timetuple",
            "timestamp",
            "content_type_id",
            "heur_category",
        ).add_hints(type=str),
    )

    no_yandexuid_filter = qf.or_(
        qf.equals("yandexuid", "-"),
        qf.equals(
            "user_id", "85d04a3258cafef9af33cab567746785"
        ),  # undefined quick fix
    )

    # Rows without a usable user skip the per-user reduce; their session
    # markers are computed row by row instead.
    stream_without_yu = stream.filter(no_yandexuid_filter).project(
        ne.all(),
        is_view_old=ne.custom(
            lambda x: int(x["durations"].get("chunks", 0) > 0), "add_info"
        ).with_type(int),
        is_view_new=ne.custom(
            lambda v, p, pp: int(
                (v or 0) > 0 or (p or 0) > 0 or (pp or 0) > 0
            ),
            "view_time",
            "price",
            "partner_price",
        ).with_type(int),
        new_ad_session_crutch=ne.const(1),
        new_user_session=ne.const(1),
        fielddate=ne.const(date_f),
    )

    # NOTE(review): the reference date below is hard-coded to 2019-09-10
    # instead of being derived from `date`; it looks like a leftover -
    # confirm before relying on user_license for other dates.
    user_licenses = (
        job.table(USER_LICENSES_TABLE)
        .filter(
            nf.custom(
                lambda x: (datetime.date(2019, 9, 10) - get_date(x)).days <= 7,
                "max_date",
            ),
            files=nfi_common_small,
        )
        .project(
            "yandexuid",
            user_license=ne.custom(
                select_subscription, "licenses_dates"
            ).with_type(qt.Optional[qt.String]),
            files=nfi_common_small,
        )
    )

    stream_with_yu = (
        stream.filter(qf.not_(no_yandexuid_filter))
        .join(
            test_buckets, type="left", by="yandexuid", assume_unique_right=True
        )
        .join(
            user_licenses,
            type="left",
            by="yandexuid",
            assume_unique_right=True,
        )
        .groupby("user_id")
        .sort("timestamp")
        .reduce(
            with_hints(
                output_schema=extended_schema(
                    is_view_old=int,
                    is_view_new=int,
                    new_ad_session_crutch=int,
                    new_user_session=int,
                )
            )(add_session_markers),
            files=nfi_common,
            memory_limit=2000,  # intensity='large_data'
        )
        .project(ne.all(), fielddate=ne.const(date_f))
    )

    job.concat(stream_with_yu, stream_without_yu).put(
        sessions  # schema=optionalize_schema(qt, microsessions_schema)
    )

    job.run()

    sessions_built = datetime.datetime.now()
    client.set_attribute(
        sessions, "build_time", sessions_built.strftime("%Y-%m-%dT%H:%M:%S")
    )

    if args.job_root != default_jobroot:
        print("job root is not default, so skipping sla check & push")
        return
    headers = get_stat_headers()
    # NOTE(review): verify=False disables TLS certificate verification for
    # this request - confirm that this is acceptable.
    values = requests.get(
        "https://upload.stat.yandex-team.ru/_api/statreport/json/{}/?scale=d&sla_type=sessions".format(
            SLA_REPORT
        ),
        verify=False,
        headers=headers,
    ).json()["values"]
    fielddates = {x["fielddate"].split(" ")[0] for x in values}
    # Push the build time (as an HHMM integer) only once per date.
    if date_f not in fielddates:
        data = [
            {
                "fielddate": date_f,
                "sla_type": "sessions",
                "time": int(sessions_built.strftime("%H%M")),
            }
        ]
        sp = StatPusher(cluster, report=SLA_REPORT)
        sp.push(data)


def main():
    """Command-line entry point.

    Parses the options, initialises the module-level ``cluster`` and runs
    process_date() for every date to process: the explicit ``--from``/``--to``
    range when both are given, otherwise every date available under
    //logs/strm-access-log/1d that is later than the last date for which a
    "sessions" table already exists under the job root.
    """
    global cluster

    option_specs = [
        ("--from", {}),
        ("--to", {}),
        ("--pool", {"default": "videolog"}),
        ("--redo", {}),
        ("--pool_trees", {"default": "physical"}),
        ("--tentative_pool_trees", {"default": "cloud"}),
        ("--weight", {"type": int, "default": 10}),
        ("--no_redir", {"action": "store_true"}),
        ("--no_sessions", {"action": "store_true"}),
        ("--rtb_dsp", {"action": "store_true"}),
        ("--job_root", {"default": default_jobroot}),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # "from" is a keyword, so the attribute has to be fetched with getattr.
    from_ = getattr(args, "from")
    to_ = args.to
    job_root = args.job_root

    kwargs = {
        "token": os.environ["YT_TOKEN"],
        "yql_token": os.environ["YQL_TOKEN"],
        "pool": args.pool,
        "templates": {
            "tmp_root": job_root.replace("strm_cube_2", "tmp"),
            "title": "STRM Cube v2",
        },
        "yt_spec_defaults": {
            "pool_trees": args.pool_trees.split(","),
            "tentative_pool_trees": args.tentative_pool_trees.split(","),
            "weight": args.weight,
        },
    }

    cluster = get_cluster(clusters, kwargs)
    driver = get_driver(cluster)
    client = driver.client

    if from_ and to_:
        dates_to_process = date_range(from_, to_)
    else:

        def is_sessions_path(path):
            return (path or "").endswith("sessions")

        # Dates that already have a sessions table under the job root.
        processed_dates = []
        for path in client.search(
            root=job_root, node_type="table", path_filter=is_sessions_path
        ):
            if get_date(path):
                processed_dates.append(get_date(path))
        processed_dates.sort()

        last_processed = processed_dates[-1]
        print("last date: {}".format(last_processed))

        # Dates for which a strm access log exists.
        available_dates = []
        for path in client.search(
            root="//logs/strm-access-log/1d", node_type="table"
        ):
            if get_date(path):
                available_dates.append(get_date(path))
        available_dates.sort()

        print("last available date: {}".format(available_dates[-1]))

        dates_to_process = []
        for candidate in available_dates:
            if candidate > last_processed:
                dates_to_process.append(candidate)

    for date in dates_to_process:
        print("processing {}...".format(date))
        process_date(date, args)


if __name__ == "__main__":
    main()
