#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import urllib
import statbox.qb
import statbox.qb.filters2 as sf
import statbox.qb.extractors2 as se
from statbox import mrlib
from statbox import mrtools


# Optional list of stids (one per line) that reduce_intersection reports as
# zero-valued 'dummy' records.  The file is looked up relative to the current
# working directory; when it is absent the set stays empty.
STIDS_IGNORE_SET = set()
if os.path.isfile('./deleted_stids_ignore_list'):
    # A context manager closes the handle as soon as the list is read,
    # instead of leaving it open for the lifetime of the process.
    with open('./deleted_stids_ignore_list') as fh:
        STIDS_IGNORE_SET = set(fh.read().splitlines())
# Captures the path segment that follows "/get/" (optionally prefixed with
# "/gate"), stopping at the next "/" or "?".
MULCA_STID_RE = re.compile(r'^(?:/gate)?/get/([^/\?]+)')


def get_stid_from_page(page):
    """
    Extract the stid from a request path.

    The path is matched against MULCA_STID_RE; the captured segment is
    URL-unquoted and stripped of newline and tab characters.

    Returns None when the path does not match the pattern.
    """
    match = MULCA_STID_RE.search(page)
    if match is None:
        return None

    decoded = urllib.unquote(match.group(1))
    # Percent-decoding may produce literal newlines or tabs; remove them.
    for unwanted in ('\n', '\t'):
        decoded = decoded.replace(unwanted, '')
    return decoded


def combined_raw_map_mulca(lines):
    """
    Pre-process the mulca access-log.

    Requests to mulca look like this:
        /gate/get/70829.251543590.1506429783171353703319430991632
        /get/70396.yadisk%3A534386.35740169577055678139495412501?raw&_X_Proxy_Client_Address=%3a%3affff%3a127.0.0.1&sign=1f57f7512073087bdb25654e86f9d2cc803c2c76b0bc38d38dc9bf4e956d8411
        /get/70829.251543590.1506429783171353703319430991632
        /get/71130.yadisk:29858957.3995732456156101061732043404183?raw

    Yields mrlib.DummyRecord(stid, 'new', datetime) for every row that
    qb.map_lines produces from `lines`.
    """
    extracted_fields = [
        'datetime',
        'page',
        'status',
        # NOTE(review): presumably request query parameters - confirm
        # against statbox.qb.extractors2.
        se.parameter('migrate'),
        se.parameter('yadisk_stid_check'),
        se.parameter('service'),
        # stid is computed from the 'page' field by get_stid_from_page
        se.custom('stid', get_stid_from_page, args='page')
    ]
    row_filters = [
        sf.defined('stid'),
        # filter out the internal migration request
        sf.not_(sf.defined('migrate')),
        sf.not_(sf.equals('service', 'mds')),
        sf.not_(sf.equals('service', 'disk')),
        sf.not_(sf.defined('yadisk_stid_check')),
        sf.not_(sf.one_of('status', ['200', '206', 'result'])),
    ]
    query = statbox.qb.QB(
        log='access-log',
        fields=extracted_fields,
        filters=row_filters
    )

    for row in query.map_lines(lines):
        yield mrlib.DummyRecord(row['stid'], 'new', row['datetime'])


@mrlib.combiner(mrtools.uniq)
def reduce_intersection(stid, raw_records):
    """
    Find intersections between deleted stids and stids requested from mulca.

    Records with subkey 'new' carry a request datetime; the smallest one is
    kept.  Any other record is treated as the removal record and its value
    is remembered (the last one seen wins).

    Nothing is yielded when there is no request datetime.  Otherwise a
    single mrlib.DummyRecord is yielded, keyed by the first 10 characters of
    the earliest request datetime, with a two-element counter list as value.
    """
    earliest_request = None
    removal_value = None

    for record in raw_records:
        if record.subkey != 'new':
            removal_value = record.value
            continue
        requested_at = record.value
        if earliest_request is None or requested_at < earliest_request:
            earliest_request = requested_at

    if not earliest_request:
        return

    day = earliest_request[:10]
    removed_before_request = removal_value and removal_value < earliest_request

    # If every counter is zero we still emit a record, so that the report
    # shows zeros rather than a gap; collapsing such records onto the single
    # 'dummy' key lets uniq reduce the number of dummy records.
    if not removed_before_request or stid in STIDS_IGNORE_SET:
        yield mrlib.DummyRecord(day, 'dummy', [0, 0])
    else:
        # Only the second counter is ever set by this reducer.
        yield mrlib.DummyRecord(day, stid, [0, 1])


def reduce_final(date, raw_records):
    """
    Sum the counter pairs of all records for one date into a report row.

    Each record value is evaluated into a (dry_remove, real_remove) pair;
    the pairs are summed component-wise and yielded as a single
    mrlib.SimpleTSKVRecord with fielddate set to `date`.
    """
    totals = [0, 0]
    for record in raw_records:
        # NOTE(review): eval() executes whatever text is stored in the
        # record value.  Presumably the values are the two-element lists
        # emitted by reduce_intersection; if they can ever come from another
        # source, consider ast.literal_eval instead.
        dry, real = eval(record.value)
        totals[0] += dry
        totals[1] += real

    dry_total, real_total = totals
    yield mrlib.SimpleTSKVRecord(
        fielddate=date,
        dry_remove=dry_total,
        real_remove=real_total,
    )


# Script entry point: hand control to mrtools.run() when executed directly.
# NOTE(review): presumably mrtools.run() dispatches to the map/reduce
# functions defined in this module - confirm against statbox.mrtools.
if __name__ == '__main__':
    mrtools.run()
