"""
This file contains regular expressions and e-mail normalization helpers.
"""

import re

# Basic regexps: unanchored building blocks (no ^/$). Several of them are
# reused by the anchored *_STRING variants further down in this file.

# Raw strings: the backslash escapes / ranges are kept literally so they can be
# interpreted by the regex engine. Neither constant is referenced in the visible
# part of this file -- presumably meant to be embedded in a character class.
DEFAULT_CLEAN_SYMBOLS = r"\n\r\t"
HEX_NUMBER_SYMBOLS = r"0-9a-fA-F"

# A single character that is not a digit (as defined by \d).
RE_NOT_DIGITS = re.compile(r"[^\d]")
# A single character that is neither 0-9 nor lowercase a-f.
RE_NOT_HEX_LOWERCASE_DIGITS = re.compile(r"[^0-9a-f]")

# Exactly 32 hex digits, either case.
RE_HASH = re.compile(r"[0-9a-fA-F]{32}")

# 1 to 64 ASCII letters/digits.
RE_ANY_ID = re.compile(r"[0-9a-zA-Z]{1,64}")

# Exactly 10 ASCII digits -- presumably a Unix timestamp in seconds (confirm).
RE_TIMESTAMP = re.compile(r"[0-9]{10}")

# MAC address patterns (unanchored), one per textual notation.
#
# The groups are plain capturing groups "(...)". Do NOT spell them "(:?...)":
# that is not the non-capturing "(?:...)" syntax, it is a capturing group that
# starts with an OPTIONAL LITERAL COLON, which makes strings such as
# "aa::bb::cc::dd::ee:ff" or ":aabbccddeeff" look like valid MAC addresses.
# Capturing groups are kept (rather than switching to "(?:...)") so that the
# group count and numbering seen by callers stay stable.
RE_MAC_ADDRESS_COLON = re.compile(r"([0-9A-Fa-f]{2}[:]){5}([0-9A-Fa-f]{2})")  # recommended!
RE_MAC_ADDRESS_DOT = re.compile(r"([0-9A-Fa-f]{4}\.[0-9A-Fa-f]{4}\.[0-9A-Fa-f]{4})")  # recommended!
RE_MAC_ADDRESS_HYPHEN = re.compile(r"([0-9A-Fa-f]{2}[-]){5}([0-9A-Fa-f]{2})")
RE_MAC_ADDRESS_NUMBER = re.compile(r"([0-9A-Fa-f]{12})")

# Any of the notations above. The alternation is wrapped in one outer group so
# that anchors or other text can be concatenated around ``.pattern`` without
# binding to the first/last alternative only.
RE_MAC_ADDRESS_ALL = re.compile(
    r"("
    + "|".join(
        [
            RE_MAC_ADDRESS_COLON.pattern,
            RE_MAC_ADDRESS_DOT.pattern,
            RE_MAC_ADDRESS_HYPHEN.pattern,
            RE_MAC_ADDRESS_NUMBER.pattern,
        ]
    )
    + r")"
)

# 1 to 20 ASCII digits.
RE_YUID = re.compile(r"[0-9]{1,20}")

# 32 hex digits without dashes (same pattern as RE_HASH).
RE_UUID = re.compile(r"[0-9a-fA-F]{32}")

# 1 to 3 ASCII digits; no range check is performed.
RE_AGE = re.compile(r"[0-9]{1,3}")

# Four dot-separated groups of 1-3 digits; octets are NOT range-checked, so
# e.g. "999.999.999.999" also matches.
RE_IPV4 = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# E-mail address (unanchored). The outermost group captures the whole address.
RE_EMAIL = re.compile(
    r"("  # capturing group: the whole address
    r"(?:"  # --- local part ---
    r"(?:"  # alternative 1: dot-separated atoms
    r"(?:[a-zA-Z0-9_'`~%+-]+\.)*"  # zero or more "atom." prefixes
    r"[a-zA-Z0-9_'`~%+-]+"  # final atom (no leading/trailing/double dots)
    r")"
    r"|"
    r"(?:[a-zA-Z0-9_'`~%+-]+)"  # alternative 2: a single atom (already covered by alternative 1)
    r")"
    r"@"
    r"(?:"  # --- domain part ---
    r"(?:"  # alternative 1: host name
    r"(?:"  # one label followed by a dot, repeated one or more times
    r"(?=[a-z0-9-]{1,63}\.)"  # lookahead: the label is 1-63 characters long
    r"(?:xn--)?[a-z0-9]+"  # optional punycode prefix, then lowercase alphanumerics
    r"(?:-[a-z0-9]+)*\.)+[a-z]{2,63}"  # inner hyphen-separated parts; then a 2-63 letter TLD
    r")"
    r"|"
    r"(?:\[?" + RE_IPV4.pattern + r"\]?)"  # alternative 2: IPv4 literal, brackets optional
    r")"
    r")"
)

# String regexps with start/end termination: the basic patterns above wrapped
# in "^" ... "$" so that the whole string has to match.
# NOTE(review): in Python "$" also matches just before a single trailing
# newline, so e.g. "1234567890\n" still satisfies RE_TIMESTAMP_STRING -- confirm
# that callers strip their input if that matters.

# Matches a string consisting of exactly one non-digit character.
RE_NOT_DIGITS_STRING = re.compile(r"^" + RE_NOT_DIGITS.pattern + "$")
RE_HASH_STRING = re.compile(r"^" + RE_HASH.pattern + "$")
RE_ANY_ID_STRING = re.compile(r"^" + RE_ANY_ID.pattern + "$")

RE_TIMESTAMP_STRING = re.compile(r"^" + RE_TIMESTAMP.pattern + "$")

RE_MAC_ADDRESS_COLON_STRING = re.compile(r"^" + RE_MAC_ADDRESS_COLON.pattern + "$")
RE_MAC_ADDRESS_HYPHEN_STRING = re.compile(r"^" + RE_MAC_ADDRESS_HYPHEN.pattern + "$")
RE_MAC_ADDRESS_DOT_STRING = re.compile(r"^" + RE_MAC_ADDRESS_DOT.pattern + "$")
RE_MAC_ADDRESS_NUMBER_STRING = re.compile(r"^" + RE_MAC_ADDRESS_NUMBER.pattern + "$")
RE_MAC_ADDRESS_ALL_STRING = re.compile(r"^" + RE_MAC_ADDRESS_ALL.pattern + "$")

RE_YUID_STRING = re.compile(r"^" + RE_YUID.pattern + "$")
RE_UUID_STRING = re.compile(r"^" + RE_UUID.pattern + "$")

RE_AGE_STRING = re.compile(r"^" + RE_AGE.pattern + "$")

# Unlike the other *_STRING patterns this one is deliberately loose: the address
# may be preceded by arbitrary text ending in a character that cannot belong to
# an address, and followed by arbitrary text starting with a character other
# than a letter, a digit or "]".
RE_EMAIL_STRING = re.compile(
    r"^(?:.*[^a-zA-Z0-9_'`~%+\-.@])?" + RE_EMAIL.pattern + r"(?:[^a-zA-Z0-9\]].*)?$"
)  # an email string may contain extra text before/after the address itself
RE_IPV4_STRING = re.compile(r"^" + RE_IPV4.pattern + "$")

# Separators on which a raw string is split when looking for the part that
# contains the address ("%3a" is the URL-encoded form of ":").
SPLITTER_SYMBOLS = [" ", "/", "\t", "\r", "\n", "%3a", ":", "%", "=", ";", "(", ")", ","]
# Substrings that are deleted from a raw string outright.
# Note: "%" is listed both here and in SPLITTER_SYMBOLS.
REMOVE_SYMBOLS = ["&lt;", "&nbsp;", "%"]
# Presence of any of these in a raw string triggers the clean-up.
BAD_SYMBOLS = SPLITTER_SYMBOLS + REMOVE_SYMBOLS
# Domain for which logins get the extra normalization done by norm_login().
YANDEX_DOMAIN = "yandex.ru"
AT = "@"

# Alias domain -> canonical domain. Only true aliases belong here: for mail.ru,
# for example, bk.ru and inbox.ru are different emails, so they are not listed.
DOMAINS_MAP = {
    # yandex
    "ya.ru": "yandex.ru",
    # google
    "googlemail.com": "gmail.com",
}


def remove_custom_garbage_from_email(orig_email):
    """Strip known garbage from a raw e-mail string.

    A string containing none of ``BAD_SYMBOLS`` is returned as is, without any
    validation. Otherwise the string is stripped, every ``REMOVE_SYMBOLS``
    entry is deleted from it, and for each ``SPLITTER_SYMBOLS`` entry present
    the string is narrowed down to the last fragment that contains ``AT``.

    Returns the cleaned string if it matches ``RE_EMAIL_STRING``, else ``None``.

    NOTE(review): "%" is deleted before the splitting step, so the "%3a" and
    "%" splitters can never be found at that point -- confirm this is intended.
    """
    if not any(symbol in orig_email for symbol in BAD_SYMBOLS):
        return orig_email

    candidate = orig_email.strip()
    for garbage in REMOVE_SYMBOLS:
        candidate = candidate.replace(garbage, "")

    for separator in SPLITTER_SYMBOLS:
        if separator not in candidate:
            continue
        # Keep only one fragment: the last one that carries the "@" sign.
        with_at = [piece for piece in candidate.split(separator) if AT in piece]
        if with_at:
            candidate = with_at[-1]

    return candidate if RE_EMAIL_STRING.match(candidate) else None


def norm_login(login):
    """Return ``login`` stripped, with every "." replaced by "-", lowercased."""
    trimmed = login.strip()
    dashed = "-".join(trimmed.split("."))
    return dashed.lower()


def norm_email(email):
    """Normalize an e-mail address.

    The value is stripped and passed through
    ``remove_custom_garbage_from_email``. If it contains ``AT``, the text before
    the first ``AT`` is taken as the login and the text between the first and
    the second ``AT`` (or the end) as the domain. The domain is lowercased, has
    square brackets removed and is mapped through ``DOMAINS_MAP``. The login is
    normalized with ``norm_login`` for ``YANDEX_DOMAIN`` and just lowercased
    for any other domain.

    Returns the normalized string, the cleaned string unchanged if it has no
    ``AT``, or ``None`` for empty input or input rejected by the clean-up step.
    """
    if not email:
        return None

    cleaned = remove_custom_garbage_from_email(email.strip())
    if not cleaned:
        return None

    pieces = cleaned.split(AT)
    if len(pieces) < 2:
        return cleaned

    local_part, host = pieces[0], pieces[1]

    # normalize emails like test@[123.32.11.5] -> test@123.32.11.5
    host = re.sub(r"\[|\]", "", host.lower())
    host = DOMAINS_MAP.get(host, host)

    if host == YANDEX_DOMAIN:
        local_part = norm_login(local_part)
    else:
        local_part = local_part.lower()

    return f"{local_part}{AT}{host}"
