#!/usr/bin/python
# coding=utf-8
import re
import sys
import urllib

import luigi
import yt.wrapper as yt

from lib.luigi import yt_luigi
from rtcconf import config
from utils import mr_utils as mr
from v2 import ids


# Table attributes for the parsed RTB-log output table; passed as `attributes`
# to `yt.create`. The column names match the keys of the records yielded by
# `map_rtb_log`.
RTB_SCHEMA = dict(
    schema=[
        dict(name=ids.CRYPTA_DEVICE_ID, type='string'),
        dict(name='ts', type='uint64'),
        dict(name='ssp_app', type='string'),
        dict(name='os', type='string'),
    ],
    optimize_for='scan',
)


def is_good_ssp_app(app):
    """Return True when `app` looks like a usable SSP application name.

    A name is rejected when it contains any of the forbidden punctuation
    characters, or when it consists solely of digits.
    """
    forbidden_chars = '-/:[]?,"\'!;\\'
    if any(ch in app for ch in forbidden_chars):
        return False

    # Purely numeric identifiers are not accepted as app names.
    return re.match(r'^\d+$', app) is None


# rtb-log record: the `queryargs` field is an '&'-separated list of key=value pairs,
# e.g. ...&337=<IDFA or GAID>&440=<app_id, URL-encoded>&...
# Keys: 337 - device id; 329 - os; 440 - app name
def map_rtb_log(rec):
    """Mapper: turn one RTB-log record into at most one device/app row.

    Reads the `queryargs` field of `rec`, splits it into key=value pairs and
    yields a single dict when keys '337' and '440' are both present and
    non-empty, the URL-decoded, lower-cased '329' value is 'ios' or
    'android', and the URL-decoded '440' value passes `is_good_ssp_app`.
    Yields nothing otherwise.
    """
    raw_query = rec.get('queryargs')
    if not raw_query:
        return

    # Keep only well-formed key=value chunks; a repeated key keeps its last value.
    params = {}
    for chunk in raw_query.split('&'):
        pieces = chunk.split('=', 1)
        if len(pieces) == 2:
            params[pieces[0]] = pieces[1]

    # Both the '337' and '440' values must be present and non-empty.
    if not (params.get('337') and params.get('440')):
        return

    device_id = params['337']
    app_name = urllib.unquote(params['440'])
    platform = urllib.unquote(params.get('329', '')).lower()

    if platform not in {'ios', 'android'}:
        return
    if not is_good_ssp_app(app_name):
        return

    # 'ts' is always written as 0 by this mapper.
    yield {ids.CRYPTA_DEVICE_ID: device_id, 'ssp_app': app_name, 'ts': 0, 'os': platform}


class ImportRTBLogDayTask(yt_luigi.BaseYtTask):
    """Luigi task that parses one RTB-log table into a sorted apps table.

    The input table path is the RTB-log folder with `date` appended. The
    records are mapped with `map_rtb_log` into the `rtb_log_apps` table under
    the 'mobile' output folder, which is then sorted by device id and `ts`.
    """

    # Appended to the RTB-log folder to form the input table path, and used
    # as a path component of the output folder.
    date = luigi.Parameter()
    # Not referenced anywhere inside this class.
    run_date = luigi.Parameter()

    priority = 1

    def input_folders(self):
        """Return the input folders keyed by their logical name."""
        return dict(rtb_log=config.STATBOX_RTB_LOG_FOLDER)

    def output_folders(self):
        """Return the output folders keyed by their logical name."""
        mobile_folder = config.YT_OUTPUT_FOLDER + self.date + '/mobile/'
        return dict(mobile=mobile_folder)

    def requires(self):
        """Depend on the RTB-log table for `date` as an external input."""
        rtb_log_table = self.in_f('rtb_log') + self.date
        return yt_luigi.ExternalInput(rtb_log_table)

    def run(self):
        """Create the output table, run the mapper over the input, then sort."""
        source = self.in_f('rtb_log') + self.date
        destination = self.out_f('mobile') + 'rtb_log_apps'

        # Create the table with the schema attributes before mapping into it.
        yt.create(
            'table',
            destination,
            ignore_existing=True,
            recursive=True,
            attributes=RTB_SCHEMA,
        )

        yt.run_map(
            map_rtb_log,
            source,
            destination,
            spec=mr.DATA_SIZE_PER_JOB_2GB_SPEC,
        )

        sort_columns = [ids.CRYPTA_DEVICE_ID, 'ts']
        yt.run_sort(destination, sort_by=sort_columns)

    def output(self):
        """Return the target for the `rtb_log_apps` table written by `run`."""
        apps_table = self.out_f('mobile') + 'rtb_log_apps'
        return [yt_luigi.YtTarget(apps_table)]


if __name__ == '__main__':
    # Ad-hoc entry point: runs the mapper directly over one hard-coded
    # RTB-log table and writes the result to a hard-coded output table.
    debug_input_table = "//statbox/bs-rtb-log/2018-08-05"
    debug_output_table = "//home/crypta/team/artembelov/rtb_log_parse_error"

    yt.run_map(map_rtb_log, debug_input_table, debug_output_table)
