pragma AnsiInForEmptyOrNullableItemsCollections;
use hahn;
pragma yson.DisableStrict;
-- pragma yt.Pool = "@[pool]";
-- pragma yt.DefaultOperationWeight = "100";
-- pragma yt.PoolTrees = "physical";
-- pragma yt.TentativePoolTrees = "cloud";
-- pragma yt.MaxJobCount = "99999";

-- Validator for yandexuid strings, used to keep only well-formed ids.
-- Re2::Match requires the whole string to match: one or more digits followed by
-- a 10-digit tail that starts with "1" and a digit from 2 to 6.
-- NOTE(review): the tail looks like a unix timestamp in seconds (about 2008-2023);
-- confirm, and widen the digit class if newer ids must pass.
$re_yandexuid = Re2::Match("[0-9]+1[23456][0-9]{8}");

-- $date_from = "2020-03-01";
-- $date_to = "2020-04-12";
-- $reduced1_table = "home/videoquality/vh_analytics/mma-4376/2020-03-01_2020-04-11/map";
-- $output_table = "home/videoquality/vh_analytics/mma-4376/2020-03-01_2020-04-11/map_with_mm";

-- Parses a '~'-separated header dump into a dictionary name -> value.
--   $headers: string of chunks shaped like "Name: value" joined with '~'.
--   Returns:  dict keyed by the lower-cased header name; a chunk that has no
--             ": " separator maps its whole text (lower-cased) to ''.
-- Only the first ": " in a chunk separates the name from the value, so values
-- that themselves contain ": " are kept whole instead of being cut short.
$parseHeaders = ($headers) -> {
    $chunks = String::SplitToList($headers, '~');
    $pairs = ListMap($chunks, ($chunk) -> {
        -- Limit = 1: at most one split; the unprocessed rest of the chunk
        -- is returned as the second (last) element.
        $pair = String::SplitToList($chunk, ': ', 1 AS Limit);
        RETURN AsTuple(String::ToLower($pair[0]), COALESCE($pair[1], ''));
    });

    RETURN ToDict($pairs);
};

-- Looks up header $name in the '~'-separated dump $headers.
-- $parseHeaders lower-cases the header names, so $name has to be lower-case.
-- Returns '' when the header is absent.
$getStrVal = ($headers, $name) -> {
    $header_dict = $parseHeaders($headers);
    $found = $header_dict[$name];
    RETURN COALESCE($found, '');
};

-- Same lookup as $getStrVal, with the textual value cast to Double.
-- The result is whatever CAST yields for the text (an absent header is cast
-- from the empty string).
$getDoubleVal = ($headers, $name) -> {
    $raw_value = $getStrVal($headers, $name);
    RETURN CAST($raw_value AS Double);
};

-- Subquery template that assembles the dataset for the daily log tables selected
-- by range(..., $date_from, $date_to):
--   1. geo points from crypta-rt-geo-log and video chunk rows built from the
--      strm access / perf / gogol logs are unioned and reduced per yandexuid
--      by the Python reducer `reducer_chunks`;
--   2. reduced rows with os_family "iOS"/"Android" are additionally unioned with
--      mobile Metrika connection-type events and reduced by `reducer2`;
--   3. the result is the union of the "Windows" rows and the output of step 2.
-- Both reducers come from the attached file "mma_4376_reducer.py".
define subquery $prepare_data($date_from, $date_to) as

-- NULL-tolerant wrapper around unwrap(): a NULL argument yields NULL, any other
-- value is passed through unwrap().
-- NOTE(review): nothing visible in this file references this helper.
$unwrap = ($x) -> {
    RETURN IF(
        $x is not null,
        unwrap($x),
        null
    )
};

-- Region id of a point given as textual/numeric latitude and longitude.
$geo_point_region_id = ($lat, $lon) -> (
    Geo::RegionByLocation(CAST($lat as Double), CAST($lon as Double)).id
);

-- Id of the "country"-level region the point is rounded up to.
$geo_point_country_id = ($lat, $lon) -> (
    Geo::RoundRegionByLocation(CAST($lat as Double), CAST($lon as Double), "country").id
);

-- Geo points from crypta-rt-geo-log, one row per log record, tagged with
-- source_log = "cryptageo". Only points whose country-level region id is 225
-- are kept (NOTE(review): presumably Russia in the geobase - confirm).
$cryptageo = (
    select
        yauid as yandexuid,
        CAST(geots as Int64) as `timestamp`,
        $geo_point_region_id(lat, lon) as region_id,
        "cryptageo" as source_log
    from range(
        `//logs/crypta-rt-geo-log/1d`,
        $date_from,
        $date_to
    )
    where $geo_point_country_id(lat, lon) == 225
);

-- Throughput computed from the entry resource_timings["jquery"]["0"]:
--   (transfer_size * 8 / 1e6) / ((response_end - request_start) / 1000).
-- NOTE(review): this is Mbit/s only if transfer_size is in bytes and the two
-- timings are in milliseconds - confirm against the producer of the data.
-- NOTE(review): nothing visible in this file references this helper.
$getMbitSec = ($resource_timings) -> {
    $entry = Yson::Lookup(Yson::Lookup($resource_timings, 'jquery'), '0');
    $started_at = Yson::LookupDouble($entry, 'request_start');
    $finished_at = Yson::LookupDouble($entry, 'response_end');
    $size = Yson::LookupInt64($entry, 'transfer_size');
    RETURN ($size * 8.0 / 1e6) / (($finished_at - $started_at) / 1000.0)
};

-- Turns the placeholder values "-" and "" into NULL; any other value is
-- returned unchanged.
$wrapIp = ($x) -> {
    RETURN CASE
        WHEN $x == "-" or $x == "" THEN NULL
        ELSE $x
    END
};

-- Row filter for access-log records: true when the request should be kept.
-- A record is rejected when
--   * the client address is a loopback address,
--   * Geo::IsYandex($ip) holds and the vhost starts with '[2a02:6b8:',
--   * the vhost is on the list of excluded hosts / host patterns below.
$ip_vhost_filter = ($ip, $vhost) -> {
    $is_loopback = $ip in ('127.0.0.1', '::1');
    $is_yandex_ipv6_literal = Geo::IsYandex($ip) and $vhost like '[2a02:6b8:%';

    RETURN (not $is_loopback)
        and (not $is_yandex_ipv6_literal)
        and $vhost != 'xaccelredirect.strm.yandex.net'
        and $vhost != 'xaccelredirect'
        and $vhost not like 'production-xaccelredirect%'
        and $vhost != 'trns-manager.strm.yandex.net'
        and $vhost not like 'trns%.strm.yandex.net%'
        and $vhost not like 'trns-strm%'
        and $vhost not like 'tumbler%.strm.yandex.net'
        and $vhost not like '%.tst.strm.yandex.net'
        and $vhost not like 'mgr%.strm.yandex.net'
        and $vhost not like 'playlist-cache%'
};

-- Maximum length of the `request` string. Both the access-log and the perf-log
-- requests are cut to this length with SUBSTRING before they are used as a join
-- key, so the two sides must share the same limit.
$lengthLimit = 300;

-- Reduces a full URL to "path" or "path?query" (scheme and host are dropped)
-- and cuts the result to $lengthLimit characters.
$getPerfRequest = ($url) -> {
    $parts = Url::Parse($url);
    $path_and_query = CASE
        WHEN $parts.Query is not null THEN $parts.Path || "?" || $parts.Query
        ELSE $parts.Path
    END;
    RETURN SUBSTRING($path_and_query, 0, $lengthLimit)
};

-- True when $val is a usable measurement: not NULL, strictly positive and
-- finite. Math::IsFinite replaces the former comparison of the value's string
-- form with "inf" / "-inf" / "nan", which depended on how doubles are printed.
$isOk = ($val) -> ($val is not null and $val > 0 and Math::IsFinite($val));



-- Extracts the "vsid" CGI parameter from a request line. The request has no
-- scheme/host, so a dummy prefix is added to make it parseable as a URL.
$strm_request_vsid = ($request) -> (
    Url::GetCGIParam("http://example.com" || $request, "vsid")
);

-- Successful (200 / 206) strm-access-log requests that pass $ip_vhost_filter
-- and carry a vsid. The vsid is taken from the full request line, while the
-- `request` column itself is cut to $lengthLimit characters.
$strm_pre_map = (
    select
        TableName() as fielddate,
        `timestamp`,
        substring(request, 0, $lengthLimit) as request,
        $strm_request_vsid(request) as vsid,
        remote_addr as ip,
        user_agent,
        -- NOTE(review): rtt is tcpinfo_rtt divided by 1000; source unit not visible here.
        CAST(tcpinfo_rtt as Double) / 1000.0 as rtt,
        CAST(tcpinfo_rttvar as Double) as tcpinfo_rttvar,
        CAST(tcpinfo_snd_cwnd as Double) as tcpinfo_snd_cwnd,
        CAST(tcpinfo_total_retrans as Double) as tcpinfo_total_retrans
    from range(
        `logs/strm-access-log/1d`,
        $date_from,
        $date_to
    )
    where
        status in ("200", "206")
        and $ip_vhost_filter(remote_addr, vhost)
        and $strm_request_vsid(request) is not null
);

-- One row per (request, fielddate) from $strm_pre_map; rows without a request
-- or an ip, or with a vsid that is empty or longer than 64 characters, are
-- dropped first. The OS family and the ISP name are derived from the aggregated
-- user agent and ip.
-- NOTE(review): every column is aggregated with its own max(), so when a group
-- holds several log records the output row may combine values that come from
-- different records.
$strm_map = (
    select
        request,
        fielddate,
        max(vsid) as vsid,
        max(`timestamp`) as `timestamp`,
        max(user_agent) as user_agent,
        UserAgent::Parse(max(user_agent)).OSFamily as os_family,
        max(ip) as ip,
        Geo::GetIspNameByIp(max(ip)) as operator,
        max(rtt) as rtt,
        max(tcpinfo_rttvar) as tcpinfo_rttvar,
        max(tcpinfo_snd_cwnd) as tcpinfo_snd_cwnd,
        max(tcpinfo_total_retrans) as tcpinfo_total_retrans
    from $strm_pre_map
    where
        request is not null
        and ip is not null
        and length(vsid) between 1 and 64
    group by
        request,
        fielddate
);

-- Download speed of one resource: size / (end - start) * 0.008.
-- NOTE(review): 0.008 turns bytes per millisecond into Mbit/s, assuming the log
-- stores sizes in bytes and timings in milliseconds - confirm.
$perf_downlink = ($size, $request_started, $response_finished) -> {
    $elapsed = CAST($response_finished as Double) - cast($request_started as Double);
    RETURN (CAST($size as Double) / $elapsed) * 0.008
};

-- strm-perf-log records whose request, normalised by $getPerfRequest, is not
-- NULL and carries a "vsid" CGI parameter.
$perf_pre_map = (
    select
        TableName() as fielddate,
        $getPerfRequest(request) as request,
        cast(transferSize as Double) as transferSize,
        $perf_downlink(transferSize, requestStart, responseEnd) as downlink
    from range(
        `logs/strm-perf-log/1d`,
        $date_from,
        $date_to
    )
    where
        $getPerfRequest(request) is not null
        and Url::GetCGIParam("http://example.com" || $getPerfRequest(request), "vsid") is not null
);

-- One row per (request, fielddate) from $perf_pre_map: the largest transfer
-- size and the smallest downlink among records whose downlink passes $isOk.
-- Groups whose largest transfer size does not exceed 500000 are discarded.
$perf_map = (
    select
        request,
        fielddate,
        transferSize,
        downlink
    from (
        select
            request,
            fielddate,
            max(transferSize) as transferSize,
            min(downlink) as downlink
        from $perf_pre_map
        where $isOk(downlink)
        group by
            request,
            fielddate
    )
    where transferSize > 500000
);

-- (fielddate, vsid, yandexuid) triples from strm-gogol-log for the
-- "StreamPlayer" service. Kept only when the yandexuid matches $re_yandexuid
-- and the vsid is 1 to 64 characters long.
$gogol_map = (
    select
        TableName() as fielddate,
        vsid,
        yandexuid
    from range(
        `logs/strm-gogol-log/1d`,
        $date_from,
        $date_to
    )
    where
        service == "StreamPlayer"
        and $re_yandexuid(yandexuid)
        and vsid is not null
        and length(vsid) between 1 and 64
);

-- Collapses $gogol_map to a single yandexuid per (fielddate, vsid); when a
-- session was seen with several yandexuids the greatest one is taken.
$gogol_grouped = (
    select
        fielddate,
        vsid,
        max(yandexuid) as yandexuid
    from $gogol_map
    group by
        vsid,
        fielddate
);


-- Video chunk rows, tagged with source_log = "chunks": every access-log request
-- from $strm_map that also has
--   * a perf-log measurement for the same request on the same day, and
--   * a gogol record giving the yandexuid of its vsid on the same day.
-- Inner joins: requests missing on either side are dropped.
$chunks = (
    select s.*, transferSize, downlink, "chunks" as source_log, yandexuid
    from $strm_map as s
    inner join $perf_map as p
    on (s.fielddate == p.fielddate and s.request == p.request)
    inner join $gogol_grouped as g
    on (s.fielddate == g.fielddate and s.vsid == g.vsid)
);

-- Input of the first reducer: geo points and video chunk rows in one stream.
-- The two selects expose different column sets; the `source_log` column tells
-- the rows apart.
$mapped = (
    select * from $cryptageo
    union all
    select * from $chunks
);

-- Formatter producing dates as YYYY-MM-DD.
$fielddateFormat = DateTime::Format("%Y-%m-%d");

-- Converts a unix timestamp in seconds to the calendar date it falls on in the
-- Europe/Moscow time zone.
-- NOTE(review): only referenced from commented-out code in this file.
$getFielddate = ($ts) -> {
    $seconds = CAST($ts as UInt32);
    $moscow_time = AddTimezone(DateTime::FromSeconds($seconds), "Europe/Moscow");
    RETURN $fielddateFormat($moscow_time)
};

-- select source_log, fielddate, count(*) as `count`
-- from $mapped
-- group by source_log, $getFielddate(`timestamp`) as fielddate
-- order by source_log, fielddate;

-- Row type consumed by the first Python reducer (see the Callable signature of
-- $reducer). All members are optional.
-- NOTE(review): several members (e.g. buffer_duration, duration, ect, event,
-- full_load, html, mbit_sec, quality, service, throughput, ttfb) are not
-- produced by any select visible in this file - presumably they belong to
-- sources that are not part of this query; confirm they are still needed.
$input_type = Struct<
    'buffer_duration':Double?,
    'downlink':Double?,
    'duration':Double?,
    'ect':String?,
    'event':String?,
    'full_load':Double?,
    'html':Double?,
    'ip':String?,
    'mbit_sec':Double?,
    'operator':String?,
    'os_family':String?,
    'quality':Uint64?,
    'region_id':Int64?,
    'rtt':Double?,
    'service':String?,
    'source_log':String?,
    'tcpinfo_rttvar':Double?,
    'tcpinfo_snd_cwnd':Double?,
    'tcpinfo_total_retrans':Double?,
    'throughput':Double?,
    'timestamp':Int64?,
    'transferSize':Double?,
    'ttfb':Double?,
    'user_agent':String?,
    'vsid':String?,
    'yandexuid':String?,
>;

-- Row type emitted by the first Python reducer; the second reducer both reads
-- and writes this type (see the Callable signatures of $reducer and $reducer2).
-- All members are optional. `connection_type` is the member that the mobile
-- Metrika rows supply before the second reduce.
$output_type = Struct<
    'buffer_duration':Double?,
    'connection_type':String?,
    'downlink':Double?,
    'ect':String?,
    'fielddate':String?,
    'first_buffer_duration':Double?,
    'first_buffer_throughput':Double?,
    'full_load':Double?,
    'html':Double?,
    'ip':String?,
    'mbit_sec':Double?,
    'operator':String?,
    'os_family':String?,
    'quality':Uint64?,
    'refuse':Int64?,
    'region_id':Int64?,
    'rtt':Double?,
    'service':String?,
    'source_log':String?,
    'start':Int64?,
    'tcpinfo_rttvar':Double?,
    'tcpinfo_snd_cwnd':Double?,
    'tcpinfo_total_retrans':Double?,
    'throughput':Double?,
    'timestamp':Int64?,
    'transferSize':Double?,
    'ttfb':Double?,
    'user_agent':String?,
    'view_time':Double?,
    'yandexuid':String?,
>;

-- First-stage reducer: the Python function `reducer_chunks` from the attached
-- file "mma_4376_reducer.py". It is called with an optional string key and the
-- stream of that key's rows ($input_type) and yields rows of $output_type.
-- The file must be attached to the query for FileContent to resolve.
$reducer = Python::reducer_chunks(
    Callable<(String?, Stream<$input_type>)->Stream<$output_type>>,
    FileContent("mma_4376_reducer.py")
);

-- Runs the first-stage reducer over $mapped: rows are grouped by yandexuid and
-- handed to the reducer ordered by `timestamp` within each group.
$reduced = (
    reduce $mapped
    presort `timestamp`
    on yandexuid
    using $reducer(TableRow())
);

-- insert into $reduced1_table WITH TRUNCATE 
-- select * from $reduced;

-- Split of the reduced rows by OS family. Rows with any other (or NULL)
-- os_family end up in neither set and therefore do not reach the final result.
$desktop = (
    select *
    from $reduced
    where os_family == "Windows"
);
$mobile = (
    select *
    from $reduced
    where os_family == "iOS" or os_family == "Android"
);

-- Distinct yandexuids seen on mobile; used to narrow the crypta matching table.
$yandexuid_whitelist = (
    select distinct yandexuid
    from $mobile
);

-- Crypta yandexuid -> mm_device_id matching rows, restricted to the yandexuids
-- in $yandexuid_whitelist. LEFT SEMI JOIN keeps only the crypta-side columns of
-- the rows that have a match, so the whitelist works purely as a filter.
-- Downstream, `id` is used as the yandexuid and `target_id` as the device id.
$device_id_to_yandexuid = (
    select * from `//home/crypta/production/state/graph/v2/matching/by_id/yandexuid/direct/mm_device_id` as crypta
    left semi join $yandexuid_whitelist as wh on (crypta.id == wh.yandexuid)
);

-- Mobile Metrika events of the whitelisted users, tagged with
-- source_log = "metrika": one row per log event whose DeviceID is matched to a
-- yandexuid through $device_id_to_yandexuid. Only cellular and Wi-Fi connection
-- types are kept. A device matched to several yandexuids yields one row per
-- matched yandexuid.
$mobmetrika = (
    select
        "metrika" as source_log,
        crypta.id as yandexuid,
        -- Int64 rather than Uint64: these rows are unioned with $output_type
        -- rows, whose `timestamp` member is Int64?, and then sorted by this
        -- column inside the second reduce.
        cast(EventTimestamp as Int64) as `timestamp`,
        ConnectionType as connection_type
    from range(
        `//logs/metrika-mobile-log/1d`, $date_from, $date_to
    ) as m
    inner join $device_id_to_yandexuid as crypta on (crypta.target_id == m.DeviceID)
    where ConnectionType in ("CONN_CELL", "CONN_WIFI")
);

-- Second-stage reducer: the Python function `reducer2` from the same attached
-- file "mma_4376_reducer.py". It is called with an optional string key and the
-- stream of that key's rows, and both reads and yields rows of $output_type.
$reducer2 = Python::reducer2(
    Callable<(String?, Stream<$output_type>)->Stream<$output_type>>,
    FileContent("mma_4376_reducer.py")
);

-- Runs the second-stage reducer over the mobile rows combined with the mobile
-- Metrika events: rows are grouped by yandexuid and handed to the reducer
-- ordered by `timestamp` within each group.
$reduced2 = (
    reduce (
        select * from $mobile
        union all
        select * from $mobmetrika
    )
    presort `timestamp`
    on yandexuid
    using $reducer2(TableRow())
);

-- Final dataset: desktop rows as produced by the first reducer plus the mobile
-- rows after the second reducer.
$united = (
    select * from $desktop
    union all
    select * from $reduced2
);

-- The last select is the result of the subquery template; writing it to a
-- table is left to the caller (the former in-place insert is kept for reference).
-- insert into $output_table WITH TRUNCATE
select * from $united;
end define;

export $prepare_data;