#!/usr/bin/env python
# coding=utf-8

import json
import math
import yt.wrapper as yt
import os
import string
import re

from nile.api.v1 import (
    filters as nf,
    Record,
    cli
)


# Text type on both Python 2 (``unicode``) and Python 3 (``str``).
_TEXT_TYPE = type(u"")

# Translation table that deletes every ASCII punctuation character from text.
_PUNCTUATION_TABLE = dict.fromkeys(map(ord, string.punctuation))


def query_normalize_filter(query, type):
    """Normalize a query and check it against an alphabet whitelist.

    ASCII punctuation is removed, runs of whitespace are collapsed to a
    single space and the text is lower-cased.  The query passes the filter
    when, after punctuation removal, it contains only characters allowed by
    the selected alphabet (letters of that alphabet and the space character).

    Args:
        query: The query as text or as UTF-8 encoded bytes.
        type: Alphabet name: 'strict', 'turkish', 'loose', 'belarusian' or
            'ukrainian'.

    Returns:
        A ``(normalized_query, passed)`` tuple.  For input that is neither
        text nor valid UTF-8 bytes the result is ``("", False)``; a tuple is
        always returned so that callers can unpack it unconditionally.

    Raises:
        KeyError: If ``type`` is not a known alphabet name.
    """
    # Each pattern matches a run of characters that are NOT allowed.
    # NOTE(review): 'strict' allows lower-case 'ё' but not upper-case 'Ё',
    # unlike the other Cyrillic alphabets here - confirm this is intended.
    regex = {
        'strict': u'[^а-яА-Яё ]+',
        'turkish': u'[^a-zA-ZçÇğĞıİöÖşŞüÜ ]+',
        'loose': u'[^a-zA-Zа-яА-ЯёЁ ]+',
        'belarusian': u'[^a-zA-Zа-яА-ЯёЁІіЎў ]+',
        'ukrainian': u'[^a-zA-Zа-яА-ЯёЁҐґІіЇїЄє ]+',
    }[type]

    if isinstance(query, bytes):
        try:
            query = query.decode("utf8")
        except UnicodeDecodeError:
            return "", False
    elif not isinstance(query, _TEXT_TYPE):
        # Numbers, None, containers, ...: nothing sensible to normalize.
        return "", False

    # Punctuation is ASCII-only, so deleting it from the decoded text is
    # equivalent to deleting the corresponding bytes before decoding.
    query = query.translate(_PUNCTUATION_TABLE)
    matches = re.search(regex, query)

    return " ".join(query.split()).lower(), not matches


def get_country(region_id, geobase):
    """Return the upper-cased ISO code of the country containing a region.

    Args:
        region_id: Key of the region in ``geobase``.
        geobase: Mapping of region id to a dict with at least the 'type',
            'path' and 'iso_name' keys.

    Returns:
        The first whitespace-separated token of the 'iso_name' of the first
        entry on the region's 'path' whose type is 3, upper-cased.  ``None``
        when the region is unknown, when its own type is 3 or lower, or when
        no type-3 entry is found on its path.
    """
    # Unknown regions and regions whose type is 3 or lower are not resolved.
    if region_id not in geobase or geobase[region_id]['type'] <= 3:
        return None

    country_codes = (
        geobase[ancestor_id]['iso_name'].split()[0].upper()
        for ancestor_id in geobase[region_id]['path']
        if ancestor_id in geobase and geobase[ancestor_id]['type'] == 3
    )
    return next(country_codes, None)


def get_geobase(path='geobase.json'):
    """Load a geobase JSON dump and index it by integer region id.

    Args:
        path: Path to a JSON file holding a list of region objects with the
            'id', 'type', 'path', 'iso_name' and 'name' keys.  'path' is a
            ', '-separated string of region ids.

    Returns:
        A dict mapping ``int(id)`` to a dict with integer 'type' and 'id',
        'path' as a list of ints, and 'iso_name' / 'name' copied as is.
    """
    # The context manager closes the file even if parsing fails.
    with open(path, 'r') as geobase_file:
        geobase_raw = json.load(geobase_file)

    geobase = {}
    for elem in geobase_raw:
        elem_id = int(elem['id'])
        geobase[elem_id] = {
            'type': int(elem['type']),
            'id': elem_id,
            # An empty 'path' string splits into [''], hence the filter.
            'path': [int(tree_id) for tree_id in elem['path'].split(', ') if tree_id],
            'iso_name': elem['iso_name'],
            'name': elem['name']
        }
    return geobase


class NanoSessionsMapper(object):
    """Mapper turning raw nano-session records into normalized query records.

    Each input record carries a JSON-encoded nano session in its "value"
    field and a timestamp in "subkey".  Sessions are kept only when they
    pass the per-service validity check, resolve to a country listed in
    ``country_dict`` and their query passes ``query_normalize_filter`` for
    that country's alphabet.

    Args:
        geobase_json: Iterable of region dicts with the 'id', 'type', 'path',
            'iso_name' and 'name' keys ('path' is a ', '-separated string).
        nano_sessions_service: "web" or "images"; selects which session
            fields are read.
    """

    # Per-service names of the nano-session fields the mapper reads.
    _SERVICE_FIELDS = {
        "web": {
            "page": "page_no",
            "query": "query",
            "device": "os_family",
            "region": "user_region",
        },
        "images": {
            "page": "page_num",
            "query": "request",
            "device": "user_agent",
            "region": "geo_region",
        },
    }

    # Per-service (substring, device label) pairs, checked in order.
    _DEVICE_MARKERS = {
        "web": (
            ("WindowsPhone", "WINDOWS PHONE"),
            ("Android", "ANDROID"),
            ("iOS", "IPHONE"),
        ),
        "images": (
            ("Windows Phone", "WINDOWS PHONE"),
            ("Android", "ANDROID"),
            ("iPhone", "IPHONE"),
        ),
    }

    def __init__(self, geobase_json, nano_sessions_service):
        self.nano_sessions_service = nano_sessions_service

        # Country code -> alphabet name passed to query_normalize_filter.
        self.country_dict = {
            "RU": "loose",
            "BY": "belarusian",
            "UA": "ukrainian",
            "KZ": "loose",
            "UZ": "loose",
            "AZ": "loose",
            "AM": "loose",
            "GE": "loose",
            "IL": "loose",
            "KG": "loose",
            "LV": "loose",
            "LT": "loose",
            "MD": "loose",
            "TJ": "loose",
            "TM": "loose",
            "EE": "loose"
        }

        # Countries reported under the single "exUSSR" country group.
        self.exUSSR = ["AZ", "AM", "GE", "IL", "KG", "LV", "LT", "MD", "TJ", "TM", "EE"]

        # Region id (int) -> parsed region description.
        self.geobase = {}
        for elem in geobase_json:
            elem_id = int(elem['id'])
            self.geobase[elem_id] = {
                'type': int(elem['type']),
                'id': elem_id,
                # An empty 'path' string splits into [''], hence the filter.
                'path': [int(tree_id) for tree_id in elem['path'].split(', ') if tree_id],
                'iso_name': elem['iso_name'],
                'name': elem['name']
            }

    def __call__(self, recs):
        """Yield one output Record per accepted nano session in ``recs``."""
        for rec in recs:
            try:
                nano_session = json.loads(rec["value"])
            except (TypeError, ValueError):
                # Missing (None) or malformed JSON payload.
                continue

            # Valid JSON that is not an object (e.g. "[]" or "null") cannot
            # be a nano session; skip it instead of failing on ``.get``.
            if not isinstance(nano_session, dict):
                continue

            if not self.check_nano_session(nano_session):
                continue

            query_region_id = self.get_region(nano_session)
            query_country = self.get_country(query_region_id)
            if query_country not in self.country_dict:
                continue

            norm_query, filter_res = query_normalize_filter(
                self.get_query(nano_session), self.country_dict[query_country]
            )
            if not filter_res:
                continue

            # The region id and country need no emptiness check here: the
            # country is a key of ``country_dict`` and the region id is an
            # integer key of ``geobase``.
            if norm_query == "" or rec.get("subkey", "") == "":
                continue

            query_device = self.get_device(nano_session)
            yield Record(
                query_text=norm_query,
                query_region_id=query_region_id,
                query_country=query_country,
                country_group_p=query_country if query_country not in self.exUSSR else "exUSSR",
                query_device=query_device,
                timestamp=rec["subkey"],
                platform="desktop" if query_device == "DESKTOP" else "touch"
            )

    def _get_service_field(self, nano_sessions, role):
        """Return the session value for ``role``; None for unknown services."""
        fields = self._SERVICE_FIELDS.get(self.nano_sessions_service)
        if fields is None:
            return None
        return nano_sessions[fields[role]]

    def check_nano_session(self, nano_session):
        """Return True when the session is usable for the configured service.

        A session is rejected when its page field is not 0 (a missing page
        field counts as 1) or when the query, device or region field is
        missing or None.  Sessions of unknown services are not validated.
        """
        fields = self._SERVICE_FIELDS.get(self.nano_sessions_service)
        if fields is None:
            return True
        if nano_session.get(fields["page"], 1) != 0:
            return False
        return all(
            nano_session.get(fields[role]) is not None
            for role in ("query", "device", "region")
        )

    def get_region(self, nano_sessions):
        """Return the region id stored in the session for this service."""
        return self._get_service_field(nano_sessions, "region")

    def get_country(self, region_id):
        """Return the upper-cased ISO code of the region's country, or None.

        None is returned for unknown regions, for regions whose own type is
        3 or lower, and when no type-3 entry is found on the region's path.
        """
        if region_id not in self.geobase or self.geobase[region_id]['type'] <= 3:
            return None

        for tree_id in self.geobase[region_id]['path']:
            if tree_id in self.geobase and self.geobase[tree_id]['type'] == 3:
                return self.geobase[tree_id]['iso_name'].split()[0].upper()

        return None

    def get_query(self, nano_sessions):
        """Return the raw query text stored in the session for this service."""
        return self._get_service_field(nano_sessions, "query")

    def get_device(self, r):
        """Classify the session's device from its OS family / user agent.

        Returns "WINDOWS PHONE", "ANDROID", "IPHONE" or "DESKTOP"; None for
        unknown services.
        """
        fields = self._SERVICE_FIELDS.get(self.nano_sessions_service)
        if fields is None:
            return None

        device_source = r.get(fields["device"], "")
        for marker, device in self._DEVICE_MARKERS[self.nano_sessions_service]:
            if marker in device_source:
                return device
        return "DESKTOP"


def _select_nano_sessions_dates(options_json):
    """Return the sorted nano-session date names selected by the options.

    The names listed under ``nano_sessions_tables_path`` are optionally
    restricted to [date_from, date_to] (both keys must be present), then the
    last ``skip_last_n_days`` names are dropped and finally only the last
    ``last_n_days`` names are kept.  A value of 0 disables either trimming.
    """
    dates = sorted(map(str, yt.list(options_json["nano_sessions_tables_path"])))

    if "date_from" in options_json and "date_to" in options_json:
        date_from = options_json["date_from"]
        date_to = options_json["date_to"]
        # A list (not a lazy ``filter`` object) so that slicing below works
        # on every Python version.
        dates = [date for date in dates if date_from <= date <= date_to]

    skip_last_n_days = int(options_json.get("skip_last_n_days", 0))
    if skip_last_n_days != 0:
        dates = dates[:-skip_last_n_days]

    last_n_days = int(options_json.get("last_n_days", 0))
    if last_n_days != 0:
        dates = dates[-last_n_days:]

    return dates


def extract_queries(job, options_json, config_json, geobase_json, output_table):
    """Build the job that samples queries from nano sessions.

    For every configured service the selected daily nano-session tables are
    concatenated and mapped with ``NanoSessionsMapper``.  The combined stream
    is optionally stripped of queries already present in the table at
    ``db_path``, sampled per (query_country, platform), split according to
    ``config_json`` weights, written to ``output_table`` and appended to the
    table at ``db_path``.

    Args:
        job: Job object on which tables and operations are declared.
        options_json: Dict of options.  Keys read here: "yt_cluster",
            "nano_sessions_services", "nano_sessions_tables_path",
            "nano_sessions_type", "db_path", "query_count", plus
            "exclude_selected" when the table at "db_path" exists, and the
            optional "date_from", "date_to", "skip_last_n_days" and
            "last_n_days".
        config_json: Iterable of dicts with "weight" and optional "country"
            and "platform"; "weight" is a percentage of "query_count".
        geobase_json: Region list passed to ``NanoSessionsMapper``.
        output_table: Destination of the sampled queries.
    """
    yt.config["proxy"]["url"] = options_json["yt_cluster"]

    tables_path = options_json["nano_sessions_tables_path"]
    query_count = int(options_json["query_count"])
    db_join_keys = ("query_text", "platform", "query_region_id")

    # The date selection does not depend on the service, so the (remote)
    # listing is done once rather than once per service.
    nano_sessions_dates = _select_nano_sessions_dates(options_json)

    prepared_data = []
    for nano_sessions_service in options_json["nano_sessions_services"]:
        nano_sessions_tables = [
            job.table("{}/{}/{}/{}".format(
                tables_path,
                nano_session_date,
                nano_sessions_service,
                options_json["nano_sessions_type"]
            ))
            for nano_session_date in nano_sessions_dates
        ]
        prepared_data.append(
            job.concat(*nano_sessions_tables).map(
                NanoSessionsMapper(geobase_json, nano_sessions_service)
            )
        )

    work_table = job.concat(*prepared_data)

    if yt.exists(options_json["db_path"]):
        db_table = job.table(options_json["db_path"])
        if options_json["exclude_selected"]:
            # Keep only the queries that are not in the database table yet.
            work_table = work_table.join(db_table, by=list(db_join_keys), type="left_only")
    else:
        db_table = None

    work_table = work_table.groupby("query_country", "platform") \
        .random(query_count, memory_limit=1024)

    output_tables = []
    for config in config_json:
        # Share of ``query_count`` for this config, rounded up.
        sample_size = int(math.ceil(query_count * int(config["weight"]) / 100.0))
        output_tables.append(
            work_table.filter(nf.equals("country_group_p", config.get("country", "")))
            .filter(nf.equals("platform", config.get("platform", "")))
            .random(sample_size)
            # NOTE(review): "query_country" is listed twice and
            # "country_group_p" is not projected - confirm this is intended.
            .project(
                "query_text",
                "query_country",
                "query_region_id",
                "query_country",
                "query_device",
                "timestamp",
                "platform"
            )
        )

    result_table = job.concat(*output_tables) \
        .put(output_table)

    if db_table is not None:
        # Append only the rows the database table does not contain yet.
        result_table \
            .join(db_table, by=list(db_join_keys), type="left_only") \
            .put(options_json["db_path"], append=True)
    else:
        result_table.put(options_json["db_path"], append=True)


def _load_json_input(file_name):
    """Parse the JSON file ``file_name`` located in the working directory."""
    # The context manager closes the file even if parsing fails.
    with open(os.path.join(os.getcwd(), file_name), "r") as input_file:
        return json.load(input_file)


@cli.statinfra_job
def make_job(job, nirvana):
    """Read the job inputs from the working directory and build the job.

    The options, sampling config and geobase are read from the JSON files
    "_files_1_", "_files_2_" and "_files_3_" respectively; the result goes
    to the first of ``nirvana.output_tables``.

    Returns:
        The ``job`` object with the query extraction declared on it.
    """
    options_json = _load_json_input("_files_1_")
    config_json = _load_json_input("_files_2_")
    geobase_json = _load_json_input("_files_3_")
    output_table = nirvana.output_tables[0]

    extract_queries(job, options_json, config_json, geobase_json, output_table)

    return job


# Script entry point: hand control to the nile CLI runner.
# NOTE(review): presumably cli.run() dispatches to the job registered with
# @cli.statinfra_job - confirm against the nile documentation.
if __name__ == "__main__":
    cli.run()
