-- ADCAMPAIGN-65: impact forecast through similar advertisers.
-- Input: per-client weekly cost vectors; the columns read by this script are
-- client_id, cube_week, history_vec and future_vec.
$CLIENT_VEC = "//home/vipplanners/insights/data/clientid_cost_vectors/weeks";
-- Input: ticket rows; the columns read by this script are client_id, key,
-- task_activation_week, task_activation_mls, is_good_age, is_good_history, is_new_client.
$TICKET_WEEKS = "//home/vipplanners/insights/data/startrek/tables/offers";
-- Output: one row per test client and impact week (overwritten on every run).
$RESULT = "//home/vipplanners/insights/data/clientid_impact_vectors/weeks";

-- Number of similar advertisers (nearest neighbours) selected per test client
$CLUSTER_SIZE = 4;
-- Length of the period used to select similar advertisers
-- Week counts are taken from https://st.yandex-team.ru/ADVERTANALYTICS-5174#5d0b76c5c68ba400205b95b9
$HISTORY_MAX_SIZE = 12;
-- Multiplier used for the "*_dd_full_impact" columns of the result
$FUTURE_MAX_SIZE = 12;


USE hahn;

-- Converts a value castable to Date into a numeric week key: year * 100 + week of year.
-- The result is NULL when the cast to Date fails.
$parse_week = ($x) -> {
  $day = CAST($x AS Date);
  RETURN CAST(
    DateTime::GetYear($day) * 100 + DateTime::GetWeekOfYear($day)
    AS Uint32
  );
};

-- Converts a timestamp given in milliseconds into a numeric week key:
-- year * 100 + week of year.
$week_from_mls = ($mls) -> {
  $moment = DateTime::FromMilliseconds($mls);
  RETURN CAST(
    DateTime::GetYear($moment) * 100 + DateTime::GetWeekOfYear($moment)
    AS Uint32
  );
};

-- One row per (client, ticket activation week) with the list of ticket keys
-- and the cohort flags of that client-week.
$CLIENT_ID_WEEK = (
  SELECT
    client_id
  , week
  , AGGREGATE_LIST(key) AS keys
    -- Within one client-week the flags are expected to be identical, so any value is taken.
    -- is_good_history: the number of weeks with past spend makes it possible to pick similar clients.
  , SOME(is_good_history) AS is_good_history
  , SOME(is_new_client) AS is_new_client
  FROM $TICKET_WEEKS
  WHERE
    -- Fresh tickets are pointless to look at: there are no statistics for them yet.
    (CurrentUtcDateTime() - DateTime::FromMilliseconds(task_activation_mls)) > Interval("P7D")
    -- Last-contact age check (8_week_OK): only tasks whose contact age
    -- is at least 8 weeks are measured.
    AND is_good_age
  GROUP BY
    client_id
  , task_activation_week AS week
);

-- (client_id, week) tuples of client-weeks with good history;
-- these are forecast through their nearest neighbours.
$CLIENT_ID_WEEK_T = (
  SELECT
    AsTuple(client_id, week) AS client_week
  FROM $CLIENT_ID_WEEK
  WHERE is_good_history
);

-- Client-weeks that are forecast through the client's own data:
-- new clients and clients without good history.
-- NOTE(review): a client-week with both is_good_history and is_new_client set
-- also passes the $CLIENT_ID_WEEK_T filter, so it is processed by both branches — confirm this is intended.
$NOVICE_WEEK = (
  SELECT
    client_id
  , week
  , is_new_client
  , keys
  FROM $CLIENT_ID_WEEK
  WHERE is_new_client
     OR NOT is_good_history
);

-- Per-client cost vectors split at cube_week, kept only for weeks that have tickets.
-- The semi join is on the week alone, so every client of such a week is retained.
--   history_vec — information known as of cube_week
--   future_vec  — information from the future
-- Both vectors are ordered by age relative to cube_week, from smaller to larger.
INSERT INTO @client_week_cost_vec WITH TRUNCATE
SELECT
  vec.client_id AS client_id
, vec.cube_week AS cube_week
, vec.future_vec AS future_vec
, vec.history_vec AS history_vec
FROM $CLIENT_VEC AS vec
LEFT SEMI JOIN $CLIENT_ID_WEEK AS ticket_week
  ON vec.cube_week = ticket_week.week
;
COMMIT;

-- Euclidean distance between the first $n positions of two vectors.
-- The vectors are zipped to the longer length; a missing element counts as 0.0.
$euclidian_dist_at = ($xs, $ys, $n) -> {
  $head_pairs = ListTake(ListZipAll($xs, $ys), $n);
  $squared_diffs = ListMap(
    $head_pairs
  , ($p) -> {RETURN Math::Pow(Coalesce($p.0, 0.0) - Coalesce($p.1, 0.0), 2)}
  );
  RETURN Math::Sqrt(ListSum($squared_diffs));
};


-- Job sizing for the pairwise self-join below (reset to defaults later in the script).
-- NOTE(review): presumably tuned so that the cross join is split into many small jobs — confirm.
PRAGMA yt.DataSizePerJob = "100M";
PRAGMA yt.DefaultMemoryLimit = "24G";

-- For every test client-week (good-history ticket weeks from $CLIENT_ID_WEEK_T)
-- pick the $CLUSTER_SIZE clients of the same cube_week whose history vectors are closest.
INSERT INTO @nearest WITH TRUNCATE
SELECT
  test_impact_week
, test_client_id
-- Number of weeks and the metric are from https://st.yandex-team.ru/ADVERTANALYTICS-5174#5d0b76c5c68ba400205b95b9
-- +1 so that the current date is captured as well
, BOTTOM_BY(
    y.client_id
  , $euclidian_dist_at(x.history_vec, y.history_vec, $HISTORY_MAX_SIZE + 1)
  , $CLUSTER_SIZE
  ) AS validate_client_ids
FROM @client_week_cost_vec AS x
CROSS JOIN @client_week_cost_vec AS y
-- x is the test client and y a candidate neighbour, so the relation is not symmetric:
-- every other client must be a candidate. The previous "x.client_id < y.client_id"
-- only considered candidates with a larger id and left the largest-id test client
-- of a week without any neighbours.
WHERE x.client_id != y.client_id
  AND x.cube_week = y.cube_week
  AND AsTuple(x.client_id, x.cube_week) IN COMPACT $CLIENT_ID_WEEK_T
GROUP BY
  x.client_id AS test_client_id
, x.cube_week AS test_impact_week
;

-- Cost vectors of the client-weeks that are forecast through the client itself.
$novice_vec = (
  SELECT
    vec.client_id AS test_client_id
  , vec.cube_week AS test_impact_week
  , vec.history_vec AS test_history_vec
  , vec.future_vec AS test_future_vec
    -- The first history element is skipped, otherwise the activation week
    -- would get into the weekly average estimate.
  , ListAvg(ListSkip(vec.history_vec, 1)) AS test_wavg_history
  , novice.is_new_client AS is_new_client
  FROM @client_week_cost_vec AS vec
  INNER JOIN $NOVICE_WEEK AS novice
    ON vec.client_id = novice.client_id
   AND vec.cube_week = novice.week
);

-- Self impact:
--   new client   — the whole future spend;
--   other client — future spend minus the average weekly history, summed over the future weeks.
INSERT INTO @novice_impact WITH TRUNCATE
SELECT
  test_client_id
, test_impact_week
, test_history_vec
, test_future_vec
, IF(
    is_new_client
  , Coalesce(ListSum(test_future_vec), 0)
  , ListSum(ListMap(
      ListZip(
        test_future_vec
      , ListReplicate(test_wavg_history, Coalesce(ListLength(test_future_vec), 0))
      )
    , ($fut_base) -> {RETURN Coalesce($fut_base.0, 0) - Coalesce($fut_base.1, 0)}
    ))
  ) AS test_self_impact
FROM $novice_vec
; COMMIT;


-- Back to the default job sizing after the heavy pairwise step.
PRAGMA yt.DataSizePerJob = default;
PRAGMA yt.DefaultMemoryLimit = default;

-- One row per (test client, week, neighbour).
$nearest_pairs = (
  SELECT
    test_impact_week
  , test_client_id
  , validate_client_id
  FROM @nearest
  FLATTEN LIST BY validate_client_ids AS validate_client_id
);

-- Per test client-week: its own vectors plus the lists of neighbour ids and neighbour vectors.
-- NOTE(review): the neighbour lists are built by independent AGGREGATE_LIST calls and
-- validate_future_vec / validate_history_vec are later combined position-wise;
-- confirm that the element order of the lists is guaranteed to match.
INSERT INTO @test_val_vec WITH TRUNCATE
SELECT
  test_client_id
, test_impact_week
, SOME(test_vec.future_vec) AS test_future_vec
, SOME(test_vec.history_vec) AS test_history_vec
, AGGREGATE_LIST(pair.validate_client_id) AS validate_client_ids
, AGGREGATE_LIST(neighbour_vec.future_vec) AS validate_future_vec
, AGGREGATE_LIST(neighbour_vec.history_vec) AS validate_history_vec
FROM $nearest_pairs AS pair
INNER JOIN $CLIENT_VEC AS neighbour_vec
  ON pair.validate_client_id = neighbour_vec.client_id
 AND pair.test_impact_week = neighbour_vec.cube_week
INNER JOIN $CLIENT_VEC AS test_vec
  ON pair.test_client_id = test_vec.client_id
 AND pair.test_impact_week = test_vec.cube_week
GROUP BY
  pair.test_client_id AS test_client_id
, pair.test_impact_week AS test_impact_week
;
COMMIT;

-- Python source shared by the UDF bindings below.
--   mmean / mmedian — position-wise mean / median across a list of vectors;
--                     shorter vectors are padded with 0.0 (zip_longest fillvalue).
--   median, accumulate — imported as-is from statistics / itertools and bound directly.
$PY_STAT_SRC = @@
from itertools import zip_longest, accumulate
from statistics import mean, median

def mmean(vecs):
    return [mean(x) for x in zip_longest(*vecs, fillvalue=0.0)]

def mmedian(vecs):
    return [median(x) for x in zip_longest(*vecs, fillvalue=0.0)]
@@;

-- Position-wise mean of several vectors.
$mmean = ArcPython3::mmean(Callable<(List<List<Uint32>>)->List<Double>>, $PY_STAT_SRC);
-- Position-wise median of several vectors.
$mmedian = ArcPython3::mmedian(Callable<(List<List<Uint32>>)->List<Double>>, $PY_STAT_SRC);
-- Median of a list of numbers (statistics.median).
$median = ArcPython3::median(Callable<(List<Double>)->Double>, $PY_STAT_SRC);
-- Running totals of a list (itertools.accumulate with its default addition).
-- NOTE(review): accumulate returns an iterator rather than a list — relies on the
-- Python UDF bridge accepting an iterator for a List result; confirm.
$accum_impact = ArcPython3::accumulate(Callable<(List<Double>)->List<Double>>,$PY_STAT_SRC);

-- Position-wise difference "fact - prediction".
-- The lists are zipped to the longer length; a missing or NULL element counts as 0.0.
$calc_impact = ($fact, $pred) -> {
  $residual = ($fp) -> {
    RETURN Coalesce(Coalesce($fp.0, 0.0) - Coalesce($fp.1, 0.0), 0.0);
  };
  RETURN ListMap(ListZipAll($fact, $pred), $residual);
};

-- Difference-in-differences of weekly averages:
-- (avg test future - avg test history) - (avg validate future - avg validate history).
-- An average that cannot be computed (NULL) counts as 0.0.
$weekly_diff_of_diff = ($test_hist, $test_fut, $val_hist, $val_fut) -> {
  $test_shift =
    Coalesce(ListAvg($test_fut), 0.0) - Coalesce(ListAvg($test_hist), 0.0);
  $val_shift =
    Coalesce(ListAvg($val_fut), 0.0) - Coalesce(ListAvg($val_hist), 0.0);
  RETURN Unwrap($test_shift - $val_shift);
};

-- Final result: one row per test client-week with the impact estimates, enriched with
-- the ticket keys and cohort flags from $CLIENT_ID_WEEK.
-- Two branches are combined with UNION ALL:
--   1) clients forecast through their nearest neighbours (@test_val_vec);
--   2) clients forecast through themselves (@novice_impact).
-- The branches expose different column sets; this relies on UNION ALL matching columns
-- by name and leaving the columns absent from a branch as NULL
-- (hence the IS NOT NULL check on test_impact_vec below).
INSERT INTO $RESULT WITH TRUNCATE
SELECT
  ci.test_client_id AS test_client_id
, st.keys AS ticket_keys
, ci.test_impact_week AS test_impact_week
-- running total of the weekly impact; NULL for rows without test_impact_vec
, IF(
    ci.test_impact_vec IS NOT NULL,
    $accum_impact(Unwrap(ci.test_impact_vec)),
    Nothing(List<Double>?)
  ) AS test_accum_impact_vec
, ci.test_impact_vec AS test_impact_vec
, ci.test_mean_dd_impact AS test_mean_dd_impact
, ci.test_median_dd_impact AS test_median_dd_impact
, ci.test_mean_dd_full_impact AS test_mean_dd_full_impact
, ci.test_median_dd_full_impact AS test_median_dd_full_impact
, ci.test_wavg_dd_impact AS test_wavg_dd_impact
, ci.test_wavg2_dd_impact AS test_wavg2_dd_impact
, ci.validate_mean_future_vec AS validate_mean_future_vec
, ci.validate_median_future_vec AS validate_median_future_vec
, ci.validate_mean_history_vec AS validate_mean_history_vec
, ci.validate_median_history_vec AS validate_median_history_vec
, ci.test_future_vec AS test_future_vec
, ci.test_history_vec AS test_history_vec
, ci.validate_client_ids AS validate_client_ids
, ci.test_self_impact AS test_self_impact
, st.is_good_history AS is_good_history
, st.is_new_client AS is_new_client
FROM (
  -- Branch 1: forecast through nearest neighbours.
  SELECT
    test_client_id
  , test_impact_week
  , test_future_vec
  , test_history_vec
  -- weekly impact: test future minus the position-wise mean future of the neighbours
  , Unwrap($calc_impact(test_future_vec, validate_mean_future_vec)) AS test_impact_vec
  -- difference-in-differences against the mean neighbour vectors, scaled by the observed future length
  , $weekly_diff_of_diff(
    test_history_vec, test_future_vec,
    validate_mean_history_vec, validate_mean_future_vec
  ) * ListLength(test_future_vec) AS test_mean_dd_impact
  -- same, against the median neighbour vectors
  , $weekly_diff_of_diff(
    test_history_vec, test_future_vec,
    validate_median_history_vec, validate_median_future_vec
  ) * ListLength(test_future_vec) AS test_median_dd_impact
  -- "full" variants: scaled by $FUTURE_MAX_SIZE instead of the observed future length
  , $weekly_diff_of_diff(
    test_history_vec, test_future_vec,
    validate_mean_history_vec, validate_mean_future_vec
  ) * $FUTURE_MAX_SIZE AS test_mean_dd_full_impact
  , $weekly_diff_of_diff(
    test_history_vec, test_future_vec,
    validate_median_history_vec, validate_median_future_vec
  ) * $FUTURE_MAX_SIZE AS test_median_dd_full_impact
  , validate_mean_future_vec
  , validate_median_future_vec
  , validate_mean_history_vec
  , validate_median_history_vec
  , validate_client_ids
  -- difference-in-differences on weekly averages, with the median of the neighbours' averages as baseline
  , ((test_wavg_future - test_wavg_history) - (validate_wavg_future - validate_wavg_history)) * ListLength(test_future_vec) AS test_wavg_dd_impact
  -- same, but the baseline is the median of the per-neighbour (avg future - avg history) differences
  , ((test_wavg_future - test_wavg_history) - $median(
      $calc_impact(validate_wavg_future_vec, validate_wavg_history_vec))
  ) * ListLength(test_future_vec) AS test_wavg2_dd_impact
  FROM (
    SELECT 
      test_client_id
    , test_impact_week
    , test_future_vec
    , test_history_vec
    , ListAvg(test_future_vec) AS test_wavg_future
    , ListAvg(test_history_vec) AS test_wavg_history
    -- position-wise mean / median across the neighbour vectors
    , $mmean(validate_future_vec) AS validate_mean_future_vec
    , $mmedian(validate_future_vec) AS validate_median_future_vec
    , $mmean(validate_history_vec) AS validate_mean_history_vec
    , $mmedian(validate_history_vec) AS validate_median_history_vec
    -- median over neighbours of each neighbour's weekly average (NULL average counts as 0.0)
    , $median(ListMap(validate_history_vec, ($xs) -> {RETURN ListAvg($xs) ?? 0.0})) AS validate_wavg_history
    , $median(ListMap(validate_future_vec, ($xs) -> {RETURN ListAvg($xs) ?? 0.0})) AS validate_wavg_future
    -- per-neighbour weekly averages, one element per neighbour
    , ListMap(validate_history_vec, ($xs) -> {RETURN ListAvg($xs)}) AS validate_wavg_history_vec
    , ListMap(validate_future_vec, ($xs) -> {RETURN ListAvg($xs)}) AS validate_wavg_future_vec
    , validate_client_ids
    FROM @test_val_vec
  )
  UNION ALL
  -- Branch 2: forecast through the client itself.
  -- Columns are listed explicitly so that a schema change in @novice_impact
  -- cannot silently change the shape of the union.
  SELECT
    test_client_id
  , test_impact_week
  , test_history_vec
  , test_future_vec
  , test_self_impact
  FROM @novice_impact
) AS ci
LEFT JOIN $CLIENT_ID_WEEK AS st
   ON ci.test_client_id = st.client_id
  AND ci.test_impact_week = st.week
ORDER BY test_impact_week, test_client_id
;
