import re
from hashlib import md5
from collections import deque, namedtuple
from datetime import datetime, timedelta
from datacloud.dev_utils.yt.yt_utils import get_yt_client
from datacloud.features.contact_actions.build_config import CONTACT_LOG_DIR
from datacloud.dev_utils.yt import features


# Default size of the "previous actions" window kept by add_prevs_and_next.
MAX_WINDOW_ACTIONS = 100
# Phone pattern: an optional leading 7 or 8, then a three-digit code starting
# with 9, then groups of 3, 2 and 2 digits. Each group may be separated by an
# optional space and/or hyphen; the code may be wrapped in parentheses.
# The number must not be glued to other digits on either side, nor be
# followed directly by "@". Captures the four digit groups.
# NOTE(review): presumably targets Russian mobile numbers - confirm.
RE_PHONE = re.compile(br'(?:.*[^0-9]|^)(?:7|8|) ?-?[(]?(9[0-9]{2})[)]? ?-?([0-9]{3}) ?-?([0-9]{2}) ?-?([0-9]{2})(?:[^0-9@].*|$)')
# Email pattern: captures an email-like token located at the very end of the
# input (the pattern is anchored with "$").
RE_EMAIL = re.compile(br'(?:.*[^a-zA-Z0-9_.+-]|^)([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+[.][a-zA-Z0-9-.]+)$')
# URL prefixes that is_contact_schema treats as contact links.
CONTACT_SCHEMAS = [
    b'tel',
    b'mailto',
    b'tg',
    b'sms',
    b'whatsapp',
    b'viber',
    b'callto',
    b'skype',
]

# One parsed log record, as produced by parse_value.
Action = namedtuple('Action', ['subkey', 'url', 'title', 'referer'])

# Root directory of the per-date input tables read by run_grep.
WL_DIR = '//user_sessions/pub/watch_log_tskv/daily'


def run_grep(date_from, date_to, yql_client=None, yt_client=None, use_cloud_nodes=True):
    """Extract contact actions from the daily logs into one result table.

    Reads ``<WL_DIR>/<date>/clean`` for every date in the inclusive range
    ``date_from``..``date_to``, reduces the rows with
    ``parse_contact_actions`` into ``<CONTACT_LOG_DIR>/<date_to>``, then sorts
    that table by ``id_value_md5, ts`` and merges its chunks. All three
    operations run inside a single YT transaction.

    Args:
        date_from: first date, formatted as ``YYYY-MM-DD``.
        date_to: last date (inclusive); also the name of the result table.
        yql_client: not used by this function.
        yt_client: YT client; when falsy, ``get_yt_client()`` supplies one.
        use_cloud_nodes: forwarded to ``features.cloud_nodes_spec`` for the
            reduce and sort operations.
    """
    if not yt_client:
        yt_client = get_yt_client()

    log_tables = []
    for date in date_range(date_from, date_to):
        log_tables.append('{}/{}/clean'.format(WL_DIR, date))

    columns = (
        ('yuid', 'string'),
        ('ts', 'uint64'),
        ('id_value', 'string'),
        ('id_type', 'string'),
        ('id_value_md5', 'string'),
        ('dwelltime', 'uint64'),
        ('url', 'string'),
        ('title', 'string'),
        ('referer', 'string'),
        ('parent_url', 'string'),
        ('parent_title', 'string'),
    )
    table_attributes = {
        'schema': [{'name': name, 'type': type_} for name, type_ in columns],
        'compression_codec': 'brotli_6',
        'optimize_for': 'scan',
    }
    result_table = yt_client.TablePath(
        '{}/{}'.format(CONTACT_LOG_DIR, date_to),
        attributes=table_attributes,
    )

    with yt_client.Transaction():
        # Step 1: group log rows by key and emit contact actions.
        reduce_spec = dict(
            title='grep contact actions log',
            **features.cloud_nodes_spec(use_cloud_nodes)
        )
        yt_client.run_reduce(
            parse_contact_actions,
            log_tables,
            result_table,
            reduce_by=['key'],
            sort_by=['key', 'subkey'],
            spec=reduce_spec,
        )

        # Step 2: order the result by hashed identifier, then by timestamp.
        sort_spec = dict(
            title='sort contact actions log',
            **features.cloud_nodes_spec(use_cloud_nodes)
        )
        yt_client.run_sort(
            result_table,
            sort_by=['id_value_md5', 'ts'],
            spec=sort_spec,
        )

        # Step 3: merge the table into itself with chunk combining enabled.
        merge_spec = {
            'title': 'merge contact actions log',
            'combine_chunks': True,
        }
        yt_client.run_merge(result_table, result_table, spec=merge_spec)


def parse_contact_actions(key, recs):
    """Reducer: yield one output row per recognised contact action.

    The records of one key are parsed into actions; every action whose url
    passes ``is_contact_schema`` and from which ``match_id_value`` extracts an
    identifier becomes an output row.

    Args:
        key: reduce key; ``key['key']`` is the yuid prefixed with one char.
        recs: iterable of records for that key (each has ``subkey``/``value``).

    Yields:
        dict with the columns of the result table. ``dwelltime`` is the
        difference between the next action's subkey and this one's, or None
        when there is no next action. ``parent_url``/``parent_title`` come from
        ``find_referer`` over the preceding actions.
    """
    actions = add_prevs_and_next(parse_value(recs))
    for action, previous_actions, next_action in actions:
        if not is_contact_schema(action.url):
            continue

        id_value, id_type = match_id_value(action.url)
        if not id_value:
            continue

        parent_url, parent_title = find_referer(action.referer, previous_actions)

        row = {}
        # Drop the leading "y" character of the key to get the bare yuid.
        row['yuid'] = key['key'][1:]
        timestamp = int(action.subkey)
        row['ts'] = timestamp
        row['id_value'] = id_value
        row['id_type'] = id_type
        row['id_value_md5'] = md5(id_value).hexdigest().encode('utf-8')
        if next_action:
            row['dwelltime'] = int(next_action.subkey) - timestamp
        else:
            row['dwelltime'] = None
        row['url'] = action.url
        row['title'] = action.title
        row['referer'] = action.referer
        row['parent_url'] = parent_url
        row['parent_title'] = parent_title
        yield row


def find_referer(referer, window):
    """Find the action in *window* that the referer points to.

    Args:
        referer: referer value of the current action (may be empty or None).
        window: iterable of actions exposing ``url`` and ``title``.

    Returns:
        ``(url, title)`` of the first action in *window* whose url contains
        *referer* as a substring, or ``(None, None)`` when *referer* is falsy
        or nothing matches.
    """
    not_found = (None, None)
    if not referer:
        return not_found

    candidates = (
        (candidate.url, candidate.title)
        for candidate in window
        if candidate.url and referer in candidate.url
    )
    return next(candidates, not_found)


def add_prevs_and_next(iterator, max_previous_size=MAX_WINDOW_ACTIONS):
    """Pair every item with its preceding items and its successor.

    Args:
        iterator: iterable of items, consumed lazily.
        max_previous_size: maximum number of preceding items kept in the
            window; a falsy value (``0``/``None``) keeps all of them.

    Yields:
        ``(item, previous_items, next_item)`` for every item of *iterator*.
        ``previous_items`` is a deque ordered from most recent to oldest and
        ``next_item`` is None for the last item. The same deque object is
        yielded every time and is mutated between yields, so consumers must
        use it before advancing the generator.
    """
    # A bounded deque discards the oldest entry (right end) on appendleft.
    window = deque(maxlen=max_previous_size or None)

    # A private sentinel marks "no item seen yet". The previous truthiness
    # check doubled as that flag, and the final yield was guarded by the
    # window being non-empty, which silently dropped the only item of a
    # single-item iterator.
    missing = object()
    current = missing
    for upcoming in iterator:
        if current is not missing:
            yield current, window, upcoming
            window.appendleft(current)
        current = upcoming

    if current is not missing:
        yield current, window, None


def is_contact_schema(url):
    """Tell whether *url* starts with one of the contact URL prefixes.

    Args:
        url: url as bytes; may be None or empty when the log record carried
            no url field.

    Returns:
        True when *url* starts with any entry of ``CONTACT_SCHEMAS``; False
        for a missing/empty url or for one starting with ``http``.
    """
    if not url:
        # Records without a url field are parsed with url=None; they are
        # simply not contact actions rather than a reason to crash the job.
        return False
    if url.startswith(b'http'):
        return False
    # bytes.startswith accepts a tuple of alternatives.
    return url.startswith(tuple(CONTACT_SCHEMAS))


def match_id_value(url):
    """Extract a phone number or an email address from a contact url.

    The phone pattern is tried first, then the email pattern.

    Args:
        url: url as bytes.

    Returns:
        ``(b'7' + the ten captured digits, b'phone')`` for a phone match,
        ``(address, b'email')`` for an email match, otherwise
        ``(None, None)``.
    """
    phone = RE_PHONE.match(url)
    if phone:
        # Normalise to a leading "7" followed by the captured digit groups.
        digits = b''.join(phone.groups())
        return b'7' + digits, b'phone'

    email = RE_EMAIL.match(url)
    if email:
        return email.group(1), b'email'

    return None, None


def date_range(from_, to_):
    """Yield every date from *from_* to *to_* inclusive, one day at a time.

    Args:
        from_: first date as a ``YYYY-MM-DD`` string.
        to_: last date as a ``YYYY-MM-DD`` string.

    Yields:
        Dates as ``YYYY-MM-DD`` strings in ascending order; nothing when
        *to_* precedes *from_*.
    """
    day_format = '%Y-%m-%d'
    one_day = timedelta(days=1)

    current = datetime.strptime(from_, day_format)
    last = datetime.strptime(to_, day_format)
    while current <= last:
        yield current.strftime(day_format)
        current = current + one_day


def parse_value(recs):
    """Parse log records into ``Action`` tuples.

    For each record the ``url``, ``title`` and ``referer`` fields are cut out
    of the tab-separated ``key=value`` payload in ``rec['value']``. A field is
    recognised only when its name is preceded by a tab; its value runs up to
    the next tab or the end of the payload.

    Args:
        recs: iterable of mappings with ``subkey`` and bytes ``value``.

    Yields:
        ``Action(subkey, url, title, referer)``; a field missing from the
        payload is None.
    """
    # Bytes are used throughout: mixing str literals with a bytes payload
    # raises TypeError on Python 3, and str keys never match bytes dict keys.
    markers = [
        (name, b'\t' + name + b'=')
        for name in (b'url', b'title', b'referer')
    ]

    for rec in recs:
        payload = rec['value']
        fields = {}
        for name, marker in markers:
            found_at = payload.find(marker)
            if found_at == -1:
                continue
            start = found_at + len(marker)
            end = payload.find(b'\t', start)
            fields[name] = payload[start:] if end == -1 else payload[start:end]
        yield Action(
            rec['subkey'],
            fields.get(b'url'),
            fields.get(b'title'),
            fields.get(b'referer'),
        )
