use hahn;

-- Matcher used to validate yandexuid values: one or more digits, then "1",
-- then one digit in 2..6, then exactly eight digits.
-- NOTE(review): Re2::Match is documented as matching the whole string (Re2::Grep is the
-- substring variant) -- confirm against the Re2 UDF docs if this pattern is ever changed.
$re_yandexuid = Re2::Match("[0-9]+1[23456][0-9]{8}");

-- Converts a Yson list of header entries into a Dict of header name -> header value.
-- Each entry must expose string fields "name" and "value"; both lookups are unwrapped,
-- so an entry lacking either one makes the expression fail instead of yielding NULL.
$parseNewsHeaders = ($yson) -> {
    $to_pair = ($entry) -> {
        $name = unwrap(Yson::LookupString($entry, "name"));
        $value = unwrap(Yson::LookupString($entry, "value"));
        RETURN AsTuple($name, $value)
    };
    $pairs = ListMap(Yson::ConvertToList($yson), $to_pair);
    RETURN ToDict($pairs)
};

-- Parses a "~"-separated header dump into a Dict of header name -> header value.
-- Every item is split once on ": "; the name (first part) is unwrapped, while the value
-- (second part) stays optional because an item may contain no ": " separator.
$parseHeaders = ($s) -> {
    $to_pair = ($item) -> {
        $parts = String::SplitToList($item, ": ", 1 as Limit);
        RETURN AsTuple(unwrap($parts[0]), $parts[1])
    };
    $items = String::SplitToList($s, "~");
    RETURN ToDict(ListMap($items, $to_pair))
};

-- Parses a ", "-separated location string into a struct:
--   field 0 -> lat (Double), field 1 -> lon (Double),
--   field 2 -> precision (UInt64), field 3 -> timestamp (UInt64).
-- A missing or non-numeric field yields NULL for the corresponding member.
$parseLocation = ($location) -> {
    $fields = String::SplitToList($location, ", ");
    RETURN AsStruct(
        CAST($fields[0] as Double) as lat,
        CAST($fields[1] as Double) as lon,
        CAST($fields[2] as UInt64) as precision,
        CAST($fields[3] as UInt64) as `timestamp`
    )
};

-- Distinct yandexuid values from the $sessions subquery that pass $re_yandexuid.
-- $sessions is invoked without arguments and must expose a `yandexuid` column.
DEFINE SUBQUERY $get_yandexuid_whitelist($sessions) AS
    SELECT DISTINCT
        yandexuid
    FROM $sessions()
    WHERE $re_yandexuid(yandexuid);
END DEFINE;

-- Rows of the crypta yandexuid -> mm_device_id matching table whose `id` is one of the
-- whitelisted yandexuids of $sessions (left semi join: only crypta columns are returned).
DEFINE SUBQUERY $get_device_id_to_yandexuid($sessions) AS
    $yandexuid_whitelist = SELECT * FROM $get_yandexuid_whitelist($sessions);

    SELECT
        *
    FROM `//home/crypta/production/state/graph/v2/matching/by_id/yandexuid/direct/mm_device_id` AS crypta
    LEFT SEMI JOIN $yandexuid_whitelist AS wh
        ON (crypta.id == wh.yandexuid);
END DEFINE;

-- Collects per-yandexuid location points from several logs (news, access, laas, gogol),
-- unions them, and runs two per-yandexuid reducers over the result: $reducer_1 (the inline
-- Python script) and $reducer_2 (the `reducer` function from the attached sphere.py).
--   $date_from, $date_to - bounds passed to range() for every daily log folder
--   $sessions            - subquery, invoked without arguments, exposing a `yandexuid` column
DEFINE SUBQUERY $get_location_data($date_from, $date_to, $sessions) AS

-- Valid yandexuids seen in $sessions; used below to semi-join the news/access/laas sources.
$yandexuid_whitelist = select * from $get_yandexuid_whitelist($sessions);
-- Only referenced by the mobmetrika/orgvisits sections, which are commented out in this file.
$device_id_to_yandexuid = select * from $get_device_id_to_yandexuid($sessions);

-- Parses the value of the "yp" cookie into a Dict.
-- The value is split on "#"; each item is split on "." and, when parts 1 and 2 both exist,
-- contributes the pair (part 1 -> part 2). Items with fewer parts are skipped; part 0 and
-- any parts after index 2 are ignored.
$parseYp = ($yp) -> {
    $to_pair = ($item) -> {
        $parts = String::SplitToList($item, ".");
        RETURN IF(
            $parts[1] is not null and $parts[2] is not null,
            AsTuple(unwrap($parts[1]), unwrap($parts[2])),
            null
        )
    };
    $items = String::SplitToList($yp, "#");
    RETURN ToDict(ListFlatMap($items, $to_pair))
};

-- Parses a "gpauto" value: fields are separated by "%3A" and use "_" where a "." is
-- expected, so every field has "_" replaced by "." before being cast.
--   field 0 -> lat (Double), field 1 -> lon (Double), field 2 -> precision (Double),
--   field 4 -> timestamp (UInt64); field 3 is not used.
$parseGpauto = ($gpauto) -> {
    $fields = String::SplitToList($gpauto, "%3A");
    $dotted = ($raw) -> {
        RETURN String::ReplaceAll($raw, "_", ".")
    };
    RETURN AsStruct(
        CAST($dotted($fields[0]) as Double) as lat,
        CAST($dotted($fields[1]) as Double) as lon,
        CAST($dotted($fields[2]) as Double) as precision,
        CAST($dotted($fields[4]) as UInt64) as `timestamp`
    )
};

-- Extracts the gpauto location from a raw Cookie header value.
-- The cookie string is parsed as "; "-separated key/value pairs, the "yp" cookie is parsed
-- with $parseYp and its "gpauto" entry with $parseGpauto. The parsed struct is returned
-- only when every piece is present and precision is at most 1000; otherwise NULL.
$parseGpautoWrapper = ($cookie) -> {
    $jar = Dsv::Parse($cookie, "; ");
    $yp_raw = $jar["yp"];
    $yp_items = $parseYp($yp_raw);
    $gpauto_raw = $yp_items["gpauto"];
    $location = $parseGpauto($gpauto_raw);
    $usable =
        $yp_raw is not null
        and $gpauto_raw is not null
        and $location is not null
        and $location.precision <= 1000
        and $location.lat is not null
        and $location.lon is not null
        and $location.`timestamp` is not null;
    RETURN IF($usable, $location, null)
};

-- Location points taken from the gpauto entry of the "yp" cookie in the strm-gogol log.
-- Both the cookie and the yandexuid are read from the `_other` dictionary column.
-- NOTE(review): unlike the news/access/laas sources, this one is neither checked with
-- $re_yandexuid nor semi-joined with $yandexuid_whitelist -- confirm this is intended.
-- NOTE(review): the table path has no leading "//" while every other log path in this
-- file does -- presumably resolved through the default path prefix; verify.
$gogol_gpauto = (
    select
        unwrap(_other["yandexuid"]) as yandexuid,
        $parseGpautoWrapper(_other["cookie"]).lat as lat,
        $parseGpautoWrapper(_other["cookie"]).lon as lon,
        $parseGpautoWrapper(_other["cookie"]).`timestamp` as `timestamp`,
    from range(
        `logs/strm-gogol-log/1d`, $date_from, $date_to
    )
    -- The wrapper already returns NULL unless lat/lon/timestamp are all present,
    -- so the three member checks below repeat what the first predicate guarantees.
    where $parseGpautoWrapper(_other["cookie"]) is not null
    and _other["yandexuid"] is not null
    and $parseGpautoWrapper(_other["cookie"]).lat is not null
    and $parseGpautoWrapper(_other["cookie"]).lon is not null
    and $parseGpautoWrapper(_other["cookie"]).`timestamp` is not null
);

-- Location points from the news scarab access log.
-- The location comes from the "x-region-location" entry of the Yson `httpHeaders` column,
-- parsed with $parseNewsHeaders + $parseLocation. Rows are kept only when precision is at
-- most 1000, the yandexuid matches $re_yandexuid and lat/lon/timestamp are all parsed.
$news = (
select
    $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).lat as lat,
    -- Longitude must be read from the parsed `.lon` member; reading `.lat` here would
    -- silently emit the latitude in both coordinate columns.
    $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).lon as lon,
    $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).`timestamp` as `timestamp`,
    unwrap(yandexUid) as yandexuid
from range(
    `//logs/news-scarab-access-log/1d`, $date_from, $date_to
)
where $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).precision <= 1000
and $re_yandexuid(yandexUid)
and $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).lat is not null
and $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).lon is not null
and $parseLocation($parseNewsHeaders(httpHeaders)["x-region-location"]).`timestamp` is not null
);

-- News location points restricted to yandexuids present in $yandexuid_whitelist.
$news_joined = (
    SELECT
        *
    FROM $news AS l
    LEFT SEMI JOIN $yandexuid_whitelist AS wh
        USING (yandexuid)
);

-- Location points from the yandex access log.
-- `headers` is parsed with $parseHeaders and the "X-Region-Location" value with
-- $parseLocation. Rows are kept only when precision is at most 1000, raw_yandexuid matches
-- $re_yandexuid and lat/lon/timestamp are all parsed.
$access = (
select
    $parseLocation(unwrap($parseHeaders(headers)["X-Region-Location"])).lat as lat,
    $parseLocation(unwrap($parseHeaders(headers)["X-Region-Location"])).lon as lon,
    $parseLocation(unwrap($parseHeaders(headers)["X-Region-Location"])).`timestamp` as `timestamp`,
    unwrap(raw_yandexuid) as yandexuid
from range(
    `//logs/yandex-access-log/1d`, $date_from, $date_to
)
-- Cheap substring pre-check placed ahead of the unwrap() calls on the header lookup.
-- NOTE(review): presumably this is what keeps unwrap() from failing on rows without the
-- header; it relies on the header name not merely appearing inside another value -- verify.
where FIND(headers, "X-Region-Location") IS NOT NULL
AND $re_yandexuid(raw_yandexuid)
AND $parseLocation(
        unwrap($parseHeaders(headers)["X-Region-Location"])
    ).precision <= 1000
AND $parseLocation(unwrap($parseHeaders(headers)["X-Region-Location"])).lat IS NOT NULL
AND $parseLocation(unwrap($parseHeaders(headers)["X-Region-Location"])).lon IS NOT NULL
AND $parseLocation(unwrap($parseHeaders(headers)["X-Region-Location"])).`timestamp` IS NOT NULL
);

-- Access-log location points restricted to yandexuids present in $yandexuid_whitelist.
$access_joined = (
    SELECT
        *
    FROM $access AS l
    LEFT SEMI JOIN $yandexuid_whitelist AS wh
        USING (yandexuid)
);

-- Location points from the laas log, which already stores lat/lon/accuracy as columns.
-- Rows with yandexuid "0", accuracy above 1000, or a missing lat/lon/unixtime are dropped.
$laas_with_yu = (
    SELECT
        yandexuid,
        `x-region-location-lat` AS lat,
        `x-region-location-lon` AS lon,
        CAST(`unixtime` AS UInt64) AS `timestamp`
    FROM RANGE(`//logs/laas-log/1d`, $date_from, $date_to)
    WHERE yandexuid != "0"
        AND `x-region-location-accuracy` <= 1000
        AND `x-region-location-lat` IS NOT NULL
        AND `x-region-location-lon` IS NOT NULL
        AND CAST(`unixtime` AS UInt64) IS NOT NULL
);

-- Laas location points restricted to yandexuids present in $yandexuid_whitelist.
$laas_with_yu_joined = (
    SELECT
        *
    FROM $laas_with_yu AS l
    LEFT SEMI JOIN $yandexuid_whitelist AS wh
        USING (yandexuid)
);

/*--[mobmetrika]
$mobmetrika = (
    select
        crypta.id as yandexuid,
        cast(Latitude as Double) as lat,
        cast(Longitude as Double) as lon,
        cast(LocationTimestamp as UInt64) ?? cast(EventTimestamp as UInt64) as `timestamp`
    from range(
        `//logs/metrika-mobile-log/1d`, $date_from, $date_to
    ) as m
    inner join $device_id_to_yandexuid as crypta on (crypta.target_id == m.DeviceID)
    where CAST(LocationPrecision as UInt64) <= 1000
    and cast(Latitude as Double) is not null
    and cast(Longitude as Double) is not null
    and cast(LocationTimestamp as UInt64) ?? cast(EventTimestamp as UInt64) is not null
);
--[mobmetrika]*/

/*--[orgvisits]
$orgvisits = (
    select
        crypta.id as yandexuid,
        lat,
        lon,
        `timestamp`
    from range(
        `//home/user_identification/orgvisits/prod/state/geologs/current`, $date_from, $date_to
    ) as o
    inner join $device_id_to_yandexuid as crypta on (crypta.target_id == o.mmetric_devid)
    where precision <= 1000
    and lat is not null
    and lon is not null
    and `timestamp` is not null
);
--[orgvisits]*/

-- Union of all enabled location sources; every branch exposes the columns
-- yandexuid, lat, lon and `timestamp`.
-- The /*--[name] ... --[name]*/ comment pairs wrap optional branches that are disabled in
-- this file. NOTE(review): they look like markers toggled by an external templating step
-- -- keep their text intact.
$tmp = (
    select * from $news_joined
    union all
    select * from $access_joined
    union all
    select * from $laas_with_yu_joined
    union all
    select * from $gogol_gpauto
    /*--[mobmetrika]
    union all
    select * from $mobmetrika
    --[mobmetrika]*/
    /*--[orgvisits]
    union all
    select * from $orgvisits
    --[orgvisits]*/
);

$py_reducer = @@

import datetime

# Exceptions datetime.fromtimestamp() may raise for a timestamp it cannot represent:
# ValueError (year out of range), OverflowError (value does not fit the platform time_t)
# and OSError (localtime() failure). Records hitting any of them are dropped.
_BAD_TIMESTAMP_ERRORS = (ValueError, OverflowError, OSError)


def process_value(value):
    """Normalize a key value: bytes are decoded as UTF-8 (invalid bytes replaced),
    dicts/lists are serialized, anything else is returned unchanged."""
    if isinstance(value, bytes):
        return value.decode("utf8", errors="replace")
    if isinstance(value, dict) or isinstance(value, list):
        # NOTE(review): `yson` is never imported in this script, so this branch would raise
        # NameError if reached. The reducer key is declared as String?, so presumably it is
        # never taken -- confirm before relying on it.
        return yson.dumps(value)
    return value


def make_record(key, ts_min, ts_max, lat, lon):
    """Build one output row for a stay at (lat, lon) lasting from ts_min to ts_max.

    fielddate_tsmax is the calendar date of ts_max as computed by
    datetime.fromtimestamp(), i.e. in the local timezone of the executing host.
    Raises whatever int()/fromtimestamp() raise for an unusable ts_max.
    """
    return {
        "yandexuid": process_value(key),
        "lat": lat,
        "lon": lon,
        "ts_min": ts_min,
        "ts_max": ts_max,
        "fielddate_tsmax": str(datetime.datetime.fromtimestamp(int(ts_max)).date())
    }


def _records_for_segment(key, ts_min, ts_max, lat, lon):
    """Return [record] for a finished segment, or [] when ts_max cannot be turned into a date."""
    try:
        return [make_record(key, ts_min, ts_max, lat, lon)]
    except _BAD_TIMESTAMP_ERRORS:
        return []


def reducer(key, recs):
    """Collapse consecutive rows of one yandexuid that share the same (lat, lon).

    recs is expected in the order supplied by the caller (the query presorts by
    timestamp). Each run of identical coordinates yields one record with the first and
    last timestamp of the run. An empty stream yields nothing.
    """
    # An explicit flag marks "no open segment yet": the values 0 / 0.0 are legitimate
    # timestamps and coordinates, so truthiness must not be used to detect "unset".
    has_segment = False
    ts_min = ts_max = lat = lon = None
    for rec in recs:
        if has_segment and rec.lat == lat and rec.lon == lon:
            # Same place as the open segment: just extend it.
            ts_max = rec.timestamp
            continue
        if has_segment:
            # Location changed: flush the finished segment before opening a new one.
            for out in _records_for_segment(key, ts_min, ts_max, lat, lon):
                yield out
        has_segment = True
        lat = rec.lat
        lon = rec.lon
        ts_min = ts_max = rec.timestamp
    if has_segment:
        for out in _records_for_segment(key, ts_min, ts_max, lat, lon):
            yield out

@@;

-- Row type consumed by the first reducer: one location point.
$reducer_1_in = Struct<
    'lat':Double?,
    'lon':Double?,
    'timestamp':UInt64?,
    'yandexuid':String?
>;

-- Row type produced by the first reducer: one run of points sharing the same coordinates.
$reducer_1_out = Struct<
    'lat':Double?,
    'lon':Double?,
    'ts_min':UInt64?,
    'ts_max':UInt64?,
    'fielddate_tsmax':String?,
    'yandexuid':String?
>;

-- Binds the `reducer` function of the inline $py_reducer script as a
-- (key, stream of rows) -> stream of rows callable.
$reducer_1 = Python::reducer(
    Callable<(String?, Stream<$reducer_1_in>)->Stream<$reducer_1_out>>,
    $py_reducer
);

-- Runs $reducer_1 once per yandexuid, feeding it that user's points ordered by timestamp.
$preprocessed = (
    REDUCE $tmp
    PRESORT `timestamp`
    ON yandexuid
    USING $reducer_1(TableRow())
);


-- Row type consumed by the second reducer (the output rows of $reducer_1).
$reducer_2_in = Struct<
    'lat':Double?,
    'lon':Double?,
    'ts_min':UInt64?,
    'ts_max':UInt64?,
    'fielddate_tsmax':String?,
    'yandexuid':String?
>;

-- Row type produced by the second reducer: the input columns plus round_hour and the
-- s2_p13 / s2_p14 / s2_p15 columns.
$reducer_2_out = Struct<
    'round_hour':UInt64?,
    'lat':Double?,
    'lon':Double?,
    'ts_min':UInt64?,
    'ts_max':UInt64?,
    'fielddate_tsmax':String?,
    'yandexuid':String?,
    's2_p13':UInt64?,
    's2_p14':UInt64?,
    's2_p15':UInt64?
>;

-- Binds the `reducer` function defined in the attached file sphere.py.
$reducer_2 = Python::reducer(
    Callable<(String?, Stream<$reducer_2_in>)->Stream<$reducer_2_out>>,
    FileContent("sphere.py")
);

-- Runs $reducer_2 once per yandexuid over the collapsed segments, ordered by ts_min.
$reduced_2 = (
    reduce $preprocessed
    presort `ts_min`
    on yandexuid
    using $reducer_2(TableRow())
);

-- Result of the $get_location_data subquery.
select * from $reduced_2;
END DEFINE;

-- Per-yandexuid network technology data from the mobile metrika log.
-- Events are mapped from DeviceID to yandexuid through the crypta matching rows selected
-- for $sessions, limited to cellular connections ("CONN_CELL") with a known NetworkType,
-- and reduced per yandexuid (ordered by timestamp) with `tech_reducer` from sphere.py.
--   $date_from, $date_to - bounds passed to range() for the daily log folder
--   $sessions            - subquery, invoked without arguments, exposing a `yandexuid` column
DEFINE SUBQUERY $get_technology_data($date_from, $date_to, $sessions) AS
    $device_id_to_yandexuid = SELECT * FROM $get_device_id_to_yandexuid($sessions);

    $mobmetrika = (
        SELECT
            crypta.id AS yandexuid,
            CAST(EventTimestamp AS UInt64) AS `timestamp`,
            NetworkType AS technology
        FROM RANGE(`//logs/metrika-mobile-log/1d`, $date_from, $date_to) AS m
        INNER JOIN $device_id_to_yandexuid AS crypta
            ON (crypta.target_id == m.DeviceID)
        WHERE CAST(EventTimestamp AS UInt64) IS NOT NULL
            AND ConnectionType == "CONN_CELL"
            AND NetworkType IS NOT NULL
    );

    -- Row type fed to the reducer.
    $tech_in_type = Struct<
        'yandexuid':String?,
        'timestamp':UInt64?,
        'technology':String?
    >;

    -- Row type produced by the reducer.
    $tech_out_type = Struct<
        'yandexuid':String?,
        'round_hour':UInt64?,
        'technology':String?,
        'ts_min':UInt64?,
        'ts_max':UInt64?,
        'fielddate_tsmax':String?
    >;

    $tech_reducer = Python::tech_reducer(
        Callable<(String?, Stream<$tech_in_type>)->Stream<$tech_out_type>>,
        FileContent("sphere.py")
    );

    REDUCE $mobmetrika
    PRESORT `timestamp`
    ON yandexuid
    USING $tech_reducer(TableRow());
END DEFINE;

EXPORT $get_location_data, $get_technology_data;