#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Parse 5-min reqans log and prepare daily tables

More info about reqans log: https://wiki.yandex-team.ru/jandekspoisk/dokumentacija/poiskovyelogi/
"""
from crypta.profile.runners.log_parsing.lib.base_parser import LogParser
from crypta.profile.utils.config import config

# YQL query template for parsing the reqans log.
#
# Templating notes:
#   * `{intermediate_table}` is a format placeholder and the doubled braces
#     (`{{` / `}}`) are escaped literal braces, so this text is presumably
#     rendered with `str.format` by the base parser -- confirm in LogParser.
#   * `$input` is not defined in this string; it has to be supplied by
#     whoever assembles the final query (presumably LogParser -- confirm).
#
# What the query does:
#   * Sets YT pragmas: default memory limit 2048M, auto-merge disabled,
#     non-strict Yson handling.
#   * `$req` picks the query text to lemmatize: `Msp.CorrectedQuery` when
#     `Msp` is present and `Msp.Relev >= 8000`, otherwise the raw `Query`.
#   * `$process_doc` extracts the host from a document's `Url` via the
#     CryptaUrlUtils UDF.
#   * Overwrites (WITH TRUNCATE) the intermediate table with one row per
#     input record: region_id, query, timestamp, yandexuid, icookie, lemmas
#     and ans_hosts.
#   * `lemmas` is NULL when the chosen query text is NULL or its length is
#     65536 or more.
#   * `ans_hosts` keeps at most the first 10 non-empty hosts from `Documents`.
#   * Rows whose YandexUid does not cast to a non-zero Uint64 are dropped.
reqans_query = """
PRAGMA yt.DefaultMemoryLimit = '2048M';
PRAGMA yt.AutoMerge = 'disabled';
PRAGMA yson.DisableStrict;

$req = ($msp, $query) -> {{
    RETURN CASE
        WHEN $msp IS NOT NULL AND $msp.Relev >= 8000
        THEN $msp.CorrectedQuery
        ELSE $query
    END
}};

$process_doc = ($doc) -> {{
    RETURN CryptaUrlUtils::ExtractHost($doc.Url);
}};


INSERT INTO `{intermediate_table}` WITH TRUNCATE
SELECT
    CAST(UserRegionId AS Uint64) AS region_id,
    Query AS query,
    _logfeller_timestamp AS `timestamp`,
    CAST(UserId.YandexUid AS Uint64) AS yandexuid,
    CAST(UserId.ICookie AS Uint64) AS icookie,
    CASE
        WHEN $req(Msp, Query) IS NOT NULL AND LENGTH($req(Msp, Query)) < 65536
        THEN Crypta::GetNotUniqueLemmas(SearchRequest::NormalizeSimple(CAST($req(Msp, Query) AS Utf8)), UiLanguage)
        ELSE NULL
    END AS lemmas,
    ListTake(
        ListFilter(ListFlatMap(Documents, $process_doc), ($host) -> {{RETURN $host != '';}}),
        10
    ) AS ans_hosts,
FROM $input
WHERE CAST(UserId.YandexUid AS Uint64) IS NOT NULL AND CAST(UserId.YandexUid AS Uint64) != 0"""


class ReqansParser(LogParser):
    """LogParser configured for the reqans log.

    All behaviour comes from the LogParser base class; this subclass only
    supplies the reqans-specific settings: the log name and directory, the
    output table schema, the YQL query template and the UDF resource the
    query relies on.
    """

    def __init__(self):
        """Pass the reqans-specific configuration to LogParser."""
        log_directory = config.REQANS_LOG_DIRECTORY

        # Column names here mirror the aliases produced by the SELECT in
        # `reqans_query`.
        schema = {
            'yandexuid': 'uint64',
            'timestamp': 'uint64',
            'query': 'string',
            'lemmas': 'any',
            'ans_hosts': 'any',
            'region_id': 'uint64',
            'icookie': 'uint64',
        }

        # The title carries the concrete class name.
        operation_title = 'log_parsing {}'.format(self.__class__.__name__)

        # The query calls CryptaUrlUtils::ExtractHost, hence this UDF library.
        udf_resources = {
            'libcrypta_url_utils_udf.so': config.CRYPTA_URL_UTILS_UDF_RESOURCE,
        }

        super(ReqansParser, self).__init__(
            log_name='reqans',
            log_dir=log_directory,
            output_schema=schema,
            query=reqans_query,
            title=operation_title,
            udf_resource_dict=udf_resources,
        )


def main():
    """Create a ReqansParser and run it."""
    ReqansParser().run()


# Run the parser only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
