-- Cluster selection and YT execution settings.
USE hahn;
PRAGMA yt.DefaultMaxJobFails = "1";
PRAGMA yt.DefaultCalcMemoryLimit = "9G";
-- PRAGMA yt.Pool = "portalytics-adhocs";

-- Files attached to the query: the libra shared library and the blockstat dictionary.
PRAGMA File("libra.so", "yt://current/statbox/resources/libra.so");
$libra_path = FilePath('libra.so');
$blockstat_bict_path = "yt://" || "hahn" || "/statbox/statbox-dict-last/blockstat.dict";
PRAGMA File("blockstat.dict", $blockstat_bict_path);

-- Reference dates: $day is yesterday in UTC; $today is that same day
-- re-parsed from its 'YYYY-MM-DD' string form with DateTime::Parse.
$day = CurrentUTCDate() - Interval('P1D');
$today = DateTime::Parse('%Y-%m-%d')(CAST($day AS String));

-- Number of days in the look-back window.
$numdays = 90;
-- $dates = ListMap(ListFromRange(1, $numdays), ($d) -> {
--     RETURN DateTime::Format('%Y-%m-%d')(DateTime::MakeDate(DateTime::Parse('%Y-%m-%d')(CAST ($day AS String))) - $d * Interval('P1D'));
-- });

----- Preparing the Python function -----
-- YQL data type mirroring libra_parse_clicks.Row; it is used as the element
-- type of the stream returned by the Python parser.
$Row = Struct<
    uid: String,
    req_ts: Uint64,
    intrasearch_reqid: String,
    saas_reqid: String,
    scope: String,
    zone: String,
    wizard_name: String,
    text: String,
    url: String,
    position: Uint32,
    factors: Dict<String, Float>,
    dwell_time: Uint64,
    is_clicked_req: Bool,
    testid: List<String>,
    suggest_version: String,
>;
----- Python source of the libra session parser (compiled into a UDF further below) -----
$libra_parser_python = @@
import libra
import urllib
from collections import defaultdict
from urlparse import parse_qsl

class Row(object):
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Copy every keyword argument onto the instance under the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

def parse_query(query):
    """Turn a URL query string into a dict of parameter name -> value.

    The dict is built from the (name, value) pairs produced by parse_qsl, so
    when a parameter is repeated the last occurrence wins.
    """
    return dict(parse_qsl(query))

def get_query_data(request):
    """Extract the intrasearch parameters from a request's processed query.

    Returns an empty dict when request.ProcessedQuery is empty. Otherwise
    returns a dict with the keys zone, scope, reqid, wizard_name and
    suggest_version; a parameter that is missing or empty is reported as ''.
    """
    if not request.ProcessedQuery:
        return {}
    query = parse_query(request.ProcessedQuery)

    def param(name):
        # Missing and empty values are both normalised to ''.
        return query.get(name) or ''

    return dict(
        zone=param('intrasearch-zone'),
        scope=param('intrasearch-scope'),
        reqid=param('intrasearch-reqid'),
        wizard_name=param('wizard_name'),
        # This key used to be read only under the misspelled name
        # 'itnrasearch-suggest-version'. Read the correctly spelled parameter
        # first and keep the misspelled one as a fallback so that any data
        # that really carries the typo is still picked up.
        suggest_version=(param('intrasearch-suggest-version')
                         or param('itnrasearch-suggest-version')),
    )


def is_intrasearch(request):
    """Tell whether the request belongs to an intrasearch SaaS service.

    True when request.SaasService starts with 'intrasearch' or is exactly
    'startrek'.
    """
    service = request.SaasService
    if service == 'startrek':
        return True
    return service.startswith('intrasearch')


def is_long_click(click, long_click_threshold=120):
    """Return True when click['dwell_time'] is strictly above the threshold."""
    dwell = click['dwell_time']
    return long_click_threshold < dwell
    
# Generator entry point of the UDF. Parses one session resource with libra and
# yields one Row per child of every main block, for each request that passes
# is_intrasearch() and carries a non-empty 'scope' query parameter.
#
# Parsing is deliberately best-effort: a row that cannot be built is skipped,
# and an error while walking the session ends the output for that session
# instead of failing the job. Only Exception is caught, so GeneratorExit and
# KeyboardInterrupt still propagate and the generator can be closed cleanly.
def parse_sessions(resource):
    try:
        key, rcont = libra.ParseSessionsFromYQLResource(resource)

        for request in rcont.GetRequests():
            if not is_intrasearch(request):
                continue

            data = get_query_data(request)
            if not data or not data['scope']:
                continue

            # Total dwell time per clicked URL (click URLs are percent-decoded).
            clicks = defaultdict(int)
            for click in request.GetClicks():
                clicks[urllib.unquote(click.Url)] += click.DwellTime

            # Decide "did this request get any click" once, before any lookup:
            # indexing a defaultdict inserts the missing key, which previously
            # made bool(clicks) true for every emitted row.
            is_clicked_req = bool(clicks)

            for position, block in enumerate(request.GetMainBlocks()):
                for ch in block.GetChildren():
                    try:
                        row = Row(
                            uid=key,
                            req_ts=long(request.Timestamp),
                            intrasearch_reqid=data['reqid'],
                            saas_reqid=request.ReqID,
                            scope=data['scope'],
                            zone=data['zone'],
                            wizard_name=data['wizard_name'],
                            text=request.Query,
                            url=ch.Url,
                            position=position,
                            suggest_version=data['suggest_version'],
                            # .get() keeps the lookup free of side effects.
                            # NOTE(review): clicks are keyed by the unquoted
                            # click URL but looked up by the raw ch.Url -
                            # confirm both are normalised the same way.
                            dwell_time=clicks.get(ch.Url, 0),
                            factors=dict(zip(request.RankingFactorNames, ch.RankingFactors)),
                            is_clicked_req=is_clicked_req,
                            testid=[str(t.TestID) for t in request.GetTestInfo()])
                    except Exception:
                        # Skip a child whose row cannot be built.
                        continue
                    # Yield outside the try block so that exceptions delivered
                    # at the yield point are not mistaken for a bad row.
                    yield row
    except Exception:
        # Unparseable or broken session: stop producing rows for it.
        pass


@@;
-- Libra preprocessor configured with the attached blockstat dictionary.
$preprocess = LibraArcPython2::MakeLibraPreprocessor(AsStruct(FilePath("blockstat.dict") AS BlockstatDict));

-- Python UDF built from $libra_parser_python: takes the preprocessed events
-- group and returns a stream of $Row structs.
$parse_sessions = ArcPython2::parse_sessions(
    Callable<(Resource<'LibraArcPython2EventsGroup'>) -> Stream<$Row>>,
    $libra_parser_python
);

-- Reducer: preprocess the rows of one key, then parse them into $Row records.
$reducer = ($key, $rows) -> {
    RETURN $parse_sessions($preprocess($key, $rows));
};

-- Kept for reference, an older RANGE expression over a different table:
-- RANGE('home/goda/onlinemetrics/surplus/v9_1/daily', DateTime::Format('%Y%m%d')(CAST($date AS date)), DateTime::Format('%Y%m%d')(CAST($date AS date) + DateTime::IntervalFromSeconds(3*3600*24)))

-- Daily search sessions from ($today - $numdays days) through $today.
-- The window length comes from $numdays instead of a repeated literal 90.
-- NOTE(review): the Python parser also accepts SaasService == 'startrek',
-- which this LIKE filter never lets through - confirm which one is intended.
$intrasearch_data =
    SELECT *
    FROM RANGE(
        "//user_sessions/pub/search/daily/",
        DateTime::Format('%Y-%m-%d')(DateTime::MakeDate($today) - $numdays * Interval('P1D')),
        DateTime::Format('%Y-%m-%d')(DateTime::MakeDate($today)),
        "columns/yandex_staff"
    )
    WHERE SaasService LIKE '%intrasearch%';

-- Run the parser once per key, with each key's rows ordered by subkey.
$result =
    REDUCE $intrasearch_data
    PRESORT
        subkey
    ON
        key
    USING $reducer(TableRow());

-- Overwrite the output table with the rows whose scope is 'search'.
INSERT INTO {{output1}}
WITH TRUNCATE
SELECT *
FROM $result AS s
WHERE scope == 'search';

