#!/usr/bin/env python3
import hashlib

# Maximum distance between a new event and the previous state's session_start
# for the new State to inherit the previous session_id
# (seconds, assuming unixtime values are in seconds).
SESSION_THRESH = 30 * 60  # 30 minutes
# Largest value of an unsigned 32-bit integer; upper bound used by State.is_valid.
UINT32_MAX = 2**32 - 1
# Maximum number of leading characters kept by norm_query().
NORM_QUERY_LENGTH = 500
# Name value that make_class() treats the same way as a missing name.
UNKNOWN_CLASS = "UNKNOWN"

def make_class(info):
    """Return the class label of a URL description.

    ``info`` must expose ``host``, ``name`` and ``type`` attributes.

    * no host                       -> ``"NATIVE"``
    * a name other than UNKNOWN     -> ``"<name> <type>"``
    * anything else                 -> the host itself
    """
    host = info.host
    if host is None:
        return "NATIVE"

    name = info.name
    has_known_name = name is not None and name != UNKNOWN_CLASS
    if not has_known_name:
        return host

    return "{} {}".format(name, info.type)


def norm_query(query):
    """Return ``query`` cut down to its first NORM_QUERY_LENGTH items."""
    truncated = query[:NORM_QUERY_LENGTH]
    return truncated


class State:
    """Accumulated state of one output session row for a (day, src, uid) key.

    A State is seeded from a single event. Code that owns the state may then
    advance ``prev_ts``, ``hits`` and ``query_list`` in place while further
    events are merged into it; ``as_dict`` renders the final row.
    """

    def __init__(self, day, src, uid, event, current_state=None):
        """Start a state from ``event``.

        Args:
            day, src, uid: grouping key, stored unchanged.
            event: first event of the state. It must expose ``unixtime``,
                ``os``, ``ui``, ``user_region``, ``useragent``, ``url_info``
                and ``referer_info``.
            current_state: the state being replaced, if any. Its
                ``session_id`` is reused when ``event`` happened no more than
                SESSION_THRESH after that state's ``session_start``.
        """
        self.day = day
        self.src = src
        self.uid = uid

        # NOTE(review): the gap is measured from the previous state's
        # session_start, not from its last event (prev_ts) - confirm intended.
        starts_new_session = (
            current_state is None
            or event.unixtime - current_state.session_start > SESSION_THRESH
        )
        if starts_new_session:
            # The id is derived from uid + timestamp, so it is reproducible.
            self.session_id = hashlib.md5((uid + str(event.unixtime)).encode()).hexdigest()
        else:
            self.session_id = current_state.session_id

        self.os = event.os
        self.prev_ts = event.unixtime  # timestamp of the last merged event
        self.session_start = event.unixtime
        self.ui = event.ui
        self.user_region = event.user_region
        self.useragent = event.useragent

        # Entry point: where the user came from.
        self.enter_class = make_class(event.referer_info)
        self.enter_host = event.referer_info.host
        self.enter_url = event.referer_info.url

        # Destination: the page that was opened.
        self.go_url = event.url_info.url
        self.go_host = event.url_info.host
        self.go_class = make_class(event.url_info)

        # Search pages do not count as hits.
        self.hits = 0 if event.url_info.is_search else 1

        self.prev_class = self.go_class
        self.prev_host = self.go_host
        self.prev_is_search = event.url_info.is_search

        self.query_list = []
        if event.url_info.is_search and event.url_info.query:
            self.query_list = [event.url_info.query]

        self.referer_query = None
        if event.referer_info.is_search and event.referer_info.query:
            self.referer_query = event.referer_info.query

    @property
    def duration(self):
        """Difference between the last merged event and the first one."""
        return self.prev_ts - self.session_start

    @property
    def is_valid(self):
        """True when the duration fits an unsigned 32-bit integer.

        A negative duration (timestamps that went backwards) is rejected as
        well: it cannot be represented as an unsigned value either.
        """
        return 0 <= self.duration <= UINT32_MAX

    @property
    def as_dict(self):
        """Render the state as a flat dict (one output row).

        ``query_text`` and ``norm_query`` are based on the first collected
        query (empty strings when there is none); ``query_list`` is limited to
        the first five queries while ``query_count`` reports all of them.
        """
        return dict(
            day=self.day,
            src=self.src,
            user_id=self.uid,
            start_ts=self.session_start,

            duration=self.duration,
            hits=self.hits,
            os=self.os,
            session_id=self.session_id,
            ui=self.ui,
            user_region=self.user_region,
            useragent=self.useragent,

            enter_host=self.enter_host,
            enter_point=self.enter_url,
            enter_type=self.enter_class,

            end_host=self.go_host,
            end_point=self.go_url,
            end_type=self.go_class,

            norm_query=norm_query(self.query_list[0]) if self.query_list else "",
            query_count=len(self.query_list),
            query_list=self.query_list[:5],
            query_text=self.query_list[0] if self.query_list else "",
            referer_query=self.referer_query,
        )


def make_session(key, recs):
    """Split the events of one key into session rows.

    Args:
        key: ``(day, src, uid)`` triple shared by all events in ``recs``.
        recs: iterable of events; may be empty, in which case nothing is
            yielded.

    Yields:
        ``State.as_dict`` of every finished state whose ``is_valid`` is true.

    Consecutive events are merged into the current state while the user stays
    inside the same search engine or on the same host; any other transition
    closes the current state and opens a new one.
    """
    day, src, uid = key
    state = None

    for event in recs:
        if state is None:
            # The first event initialises the state.
            state = State(day, src, uid, event)
            continue

        if state.prev_is_search and make_class(event.url_info) == state.prev_class:
            state.prev_ts = event.unixtime
            # Skip everything inside this search engine; only remember the queries.
            if event.url_info.query:
                state.query_list.append(event.url_info.query)

        elif not state.prev_is_search and state.prev_host == event.url_info.host:
            # Navigation within the same site.
            state.prev_ts = event.unixtime
            state.hits += 1

        else:
            # Transition between sites: close the current state and start a
            # new one (the old state is passed on so the session_id can be reused).
            if state.is_valid:
                yield state.as_dict

            state = State(day, src, uid, event, state)

    # Emit the trailing state; ``state`` is still None when ``recs`` was empty.
    if state is not None and state.is_valid:
        yield state.as_dict


class UrlInfo:
    """Description of a URL built from a positional record.

    ``raw_info`` is indexed as ``(host, is_search, name, query, type, url)``.
    Every field is passed through ``unwrap`` first, so a field may be given
    either directly or wrapped in a list.
    """

    def __init__(self, raw_info):
        """Read the six fields from ``raw_info`` (positions 0 to 5)."""
        self.is_search = self.unwrap(raw_info[1])
        self.name = self.unwrap(raw_info[2])
        self.query = self.unwrap(raw_info[3])
        self.type = self.unwrap(raw_info[4])
        self.url = self.unwrap(raw_info[5])
        self.host = self.unwrap(raw_info[0])

    @staticmethod
    def unwrap(value):
        """Return the first element of a list, or ``value`` itself otherwise.

        An empty list is treated as a missing value and yields ``None``
        instead of raising ``IndexError``.
        """
        if isinstance(value, list):
            return value[0] if value else None
        return value


class Event:
    """One input event, built from a mapping-like ``row``.

    ``day`` is optional (``None`` when absent); every other field must be
    present in ``row``. ``url_info`` and ``referer_info`` are wrapped in
    ``UrlInfo``; the remaining fields are copied unchanged.
    """

    def __init__(self, row):
        self.day = row.get("day")
        self.url_info = UrlInfo(row["url_info"])
        self.os = row["os"]
        self.referer_info = UrlInfo(row["referer_info"])

        # Plain pass-through fields, copied in a fixed order.
        for field in ("src", "ui", "uid", "unixtime", "url", "user_region", "useragent"):
            setattr(self, field, row[field])


# The input file read by main() can be produced with:
#   yt read //home/searchshare/ivankun/squeeze/spylog/events/2021-08-15[y3103517860150206701:y3103517860150206702] --format yson > /tmp/events.yson

def main():
    """Build sessions from a local YSON dump and store them as JSON.

    Reads ``/tmp/events.yson``, sorts the events by
    ``(day, src, uid, unixtime)``, runs ``make_session`` on every
    ``(day, src, uid)`` group, prints each key, each non-empty ``query_text``
    and the total number of sessions, then writes all sessions to
    ``lib/sessions.json``.
    """
    import itertools
    import json
    import yt.yson as yson

    # Parse the whole dump inside the ``with`` so the handle is closed right away.
    with open('/tmp/events.yson', 'rb') as source:
        events = [Event(yson.yson_to_json(r)) for r in yson.load(source, yson_type="list_fragment")]

    # groupby() only merges adjacent items, hence the sort by the same key first.
    events.sort(key=lambda x: (x.day, x.src, x.uid, x.unixtime))
    sessions = []
    for key, group in itertools.groupby(events, key=lambda x: (x.day, x.src, x.uid)):
        print(key)
        for session in make_session(key, group):
            sessions.append(session)
            if session["query_text"]:
                print(session["query_text"])
    print(len(sessions))

    # Closing the file explicitly guarantees the JSON is flushed to disk.
    with open("lib/sessions.json", "w") as sink:
        json.dump(sessions, sink)


# Run the conversion only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
