# -*- coding: utf-8 -*-
import logging
from sandbox.projects.common import error_handlers as eh


# Zero-based index (within the tab-split line) of the first factor column
# in the default format, i.e. when is_est_features is False.
_FACTOR_START_FIELD = 5
# Zero-based index (within the tab-split line) of the first factor column
# in the tsv pool format, i.e. when is_est_features is True.
_FACTOR_START_FIELD_TSVPOOL = 11


def _parse(file_name, callback, is_dbg_dolbilka, factors_count=None, is_est_features=False):
    """
        Parse a tab-separated output file line by line and feed every record to ``callback``.

        :param file_name: path of the file to read
        :param callback: callable invoked once per parsed line as
            ``callback(query_number, relev, docid, url, factors)``
        :param is_dbg_dolbilka: accepted for interface compatibility; not used by this function
        :param factors_count: expected number of factors per line; when not None,
            a line whose field count differs from ``first factor index + factors_count``
            is reported as an error
        :param is_est_features: selects the tsv pool layout: query id is kept as a string,
            relevance is a float, docid is reported as 0, url is field 3 and factors
            start at ``_FACTOR_START_FIELD_TSVPOOL``; otherwise query id and relevance are
            ints, docid is field 2 (must be all digits), url is field 4 and factors
            start at ``_FACTOR_START_FIELD``

        Any parsing problem is reported through ``eh.check_failed`` (presumably raising;
        its exact behaviour is defined in ``error_handlers``).
    """
    # Loop-invariant: index of the first factor column for the selected layout.
    # It is also used for the field-count check, so that the check agrees with
    # the slice the factors are actually taken from.
    start_index = _FACTOR_START_FIELD_TSVPOOL if is_est_features else _FACTOR_START_FIELD

    # Context manager closes the file even if parsing or the callback fails.
    with open(file_name) as input_file:
        for line in input_file:
            # Iterating a file yields lines with their terminator, so a plain
            # truthiness test would never detect a blank line.
            if not line.rstrip("\r\n"):
                continue

            try:
                parts = line.split("\t")
                if factors_count is not None and factors_count + start_index != len(parts):
                    eh.check_failed("Invalid number of fields")

                # tsv pool requestId is size_t, so it is kept as a string rather than converted
                query_number = int(parts[0]) if not is_est_features else str(parts[0])
                relev = int(parts[1]) if not is_est_features else float(parts[1])

                if not is_est_features and not parts[2].isdigit():
                    eh.check_failed("No support for old dolbilka output format")

                docid = parts[2] if not is_est_features else 0

                url = parts[4] if not is_est_features else parts[3]

                # float() tolerates the trailing newline left on the last field.
                factors = [float(factor) for factor in parts[start_index:]]
            except Exception:
                eh.check_failed("Cannot parse line '{}', error: {}".format(line, eh.shifted_traceback()))

            callback(query_number, relev, docid, url, factors)


def parse_and_group_by_query(file_name, is_dbg_dolbilka, factors_count=None, is_est_features=False):
    """
        dbg_dolbilka/idx_ops output parser

        Parses ``file_name`` with ``_parse`` and groups the records by query.

        :param file_name: path of the output file to parse
        :param is_dbg_dolbilka: forwarded to ``_parse`` unchanged
        :param factors_count: expected number of factors per line (None disables the check)
        :param is_est_features: forwarded to ``_parse``; selects the tsv pool layout
        :return: dict ``query_number -> {"<docid> <url>": UrlData}``, where ``UrlData``
            has ``relev`` and ``factors`` attributes. If the same "<docid> <url>"
            key occurs more than once within a query, the last occurrence wins.

        NOTE: It is slow, do not try to apply it to files larger than 1G
        as it loads all output to memory.
    """

    logging.info("Started parsing output '%s' and grouping by query", file_name)

    class UrlData:
        # Plain holder for the relevance and the factor list of one document.
        def __init__(self, relev, factors):
            self.relev = relev
            self.factors = factors

    # queries[query_number] -> urls[string with url and docid] -> UrlData()
    result = {}

    def _process_line(query_number, relev, docid, url, factors):
        # Create the per-query mapping on first sight only, then always add the
        # document to it; passing the document as the setdefault() default would
        # keep just the first document of every query.
        urls = result.setdefault(query_number, {})
        urls["{} {}".format(docid, url)] = UrlData(relev, factors)

    _parse(file_name, _process_line, is_dbg_dolbilka, factors_count=factors_count, is_est_features=is_est_features)

    return result
