package ru.yandex.wmtools.common.util;

import ru.yandex.wmtools.common.SupportedProtocols;
import sun.security.action.GetPropertyAction;

import java.io.CharArrayWriter;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.security.AccessController;
import java.util.BitSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for validating http/https URLs, estimating their URL-encoded size and
 * extracting parts of a {@link URL} or host name.
 *
 * <p>The class cannot be instantiated; all members are static.</p>
 *
 * @author avhaliullin
 */
public final class URLUtil {
    /** Largest value accepted for the port component (ports are 16-bit numbers). */
    private static final int MAX_PORT = 65535;

    /** Longest accepted textual form of the port; longer digit strings are rejected outright. */
    private static final int MAX_PORT_DIGITS = 5;

    // Capturing groups of URL_PATTERN that are read by parse().
    private static final int GROUP_SCHEME = 1;
    private static final int GROUP_HOST = 2;
    private static final int GROUP_PORT = 4;
    // Everything after the authority: "/" + path + optional "?query" + optional "#fragment".
    private static final int GROUP_TAIL = 5;
    private static final int GROUP_FRAGMENT = 10;

    private static final Pattern URL_PATTERN = Pattern.compile(
            "^(https?)://" +   //scheme
                    "([^:/]+)" + //host
                    "(:(\\d+))?" + //port
                    "(/([^?#]+)?" + //path
                    "(\\?([^#]+)?)?" + //params
                    "(#(.+)?)?)?$" //anchor
    );

    private static final Pattern HOST_PATTERN = Pattern.compile(
            "^([0-9a-zA-Z]([0-9a-zA-Z-_]*[0-9a-zA-Z])?\\.)+([0-9a-zA-Z]([0-9a-zA-Z-_]*[0-9a-zA-Z])?)$"
    );

    // pct-encoded = "%" HEXDIG HEXDIG
    // Exactly one escape: repetition is supplied by the enclosing "*" in FRAGMENT. Putting a
    // second "*" here would accept the same strings but lets the matcher split a run of
    // escapes in exponentially many ways before rejecting an input that ends with a bad char.
    private static final String PCT_ENCODED = "%[a-fA-F0-9]{2}";

    // ALPHA / DIGIT / "-" / "." / "_" / "~"
    private static final String UNRESERVED = "[a-zA-Z0-9-._~]";

    // "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
    private static final String SUBDELIMS = "[!$&')(*+,;=]";

    // unreserved / pct-encoded / sub-delims / ":" / "@"
    private static final String PCHAR = "(" + UNRESERVED + "|" + PCT_ENCODED + "|" + SUBDELIMS + "|" +"[:@])";

    // fragment      = *( pchar / "/" / "?" )
    private static final String FRAGMENT = "(" + PCHAR  + "|[/?])*";

    private static final Pattern FRAGMENT_PATTERN = Pattern.compile(FRAGMENT);

    // robot allows some non-standard characters in path part of url
    // - square brackets: '[' | ']'
    private static final String YANDEX_EXTENDED_PATH_CHAR = "[\\]\\[]";

    private static final Pattern PATH_PATTERN = Pattern.compile(
            "^(([-:@&=+,.!/~*'$_;0-9a-zA-Z)(?#]|%[a-zA-Z0-9]{2}|" + YANDEX_EXTENDED_PATH_CHAR + ")*)?$");

    private static final Pattern SPAMER_HOST_PATTERN = Pattern.compile(
            "^(https://)?(.*-.*)\\.(.*)\\.ru\\.com(:.*)?$");

    private URLUtil() {
    }

    /**
     * Splits a URL string into the components that are validated separately.
     *
     * @param url URL text, may be {@code null}
     * @return the parsed components, or {@code null} when {@code url} is {@code null} or does
     *         not match {@link #URL_PATTERN} as a whole
     */
    private static ParsedUrl parse(String url) {
        if (url == null) {
            return null;
        }
        Matcher matcher = URL_PATTERN.matcher(url);
        // matches() rather than find(): "$" also matches just before a trailing line
        // terminator, so find() would accept inputs such as "http://host:80\n".
        if (!matcher.matches()) {
            return null;
        }

        return new ParsedUrl(
                matcher.group(GROUP_SCHEME),
                matcher.group(GROUP_HOST),
                matcher.group(GROUP_PORT),
                matcher.group(GROUP_TAIL),
                matcher.group(GROUP_FRAGMENT)
        );
    }

    /**
     * Checks the host name against {@link #HOST_PATTERN} and additionally forbids "_" in the
     * last two labels (it is allowed only on the third level and above).
     */
    private static boolean checkHost(String host) {
        if (host == null || !HOST_PATTERN.matcher(host).matches()) {
            return false;
        }
        if (host.contains("_")) {
            final String[] parts = host.split("\\.");
            // Look for "_" in the second-to-last and the last labels of the host name.
            for (int i = Math.max(parts.length - 2, 0); i < parts.length; i++) {
                if (parts[i].contains("_")) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Accepts an absent port, or a decimal port of at most five digits whose value does not
     * exceed {@link #MAX_PORT}.
     */
    private static boolean checkPort(String port) {
        if (port == null) {
            return true;
        }
        if (port.length() > MAX_PORT_DIGITS) {
            return false;
        }
        try {
            return Integer.parseInt(port) <= MAX_PORT;
        } catch (NumberFormatException e) {
            // Not reachable for values captured by URL_PATTERN; treat anything else as invalid.
            return false;
        }
    }

    private static boolean checkPath(String path) {
        return path == null || PATH_PATTERN.matcher(path).matches();
    }

    private static boolean checkFragment(String fragment) {
        return fragment == null || FRAGMENT_PATTERN.matcher(fragment).matches();
    }

    /**
     * Checks whether the string is an acceptable http/https URL.
     *
     * <p>The URL must match the overall scheme://host[:port][/path[?query][#fragment]] shape,
     * and its host, port, path-with-query-and-fragment and fragment must each pass their own
     * check.</p>
     *
     * @param url URL text, may be {@code null}
     * @return {@code true} if every check passes; {@code false} otherwise, including for
     *         {@code null}
     */
    public static boolean isURLValid(String url) {
        ParsedUrl parsedUrl = parse(url);
        return parsedUrl != null &&
                checkHost(parsedUrl.host) &&
                checkPath(parsedUrl.path) &&
                checkPort(parsedUrl.port) &&
                checkFragment(parsedUrl.fragment);
    }

    /** Components captured from {@link #URL_PATTERN}; any of them except scheme and host may be {@code null}. */
    private static final class ParsedUrl {
        private final String scheme;
        private final String host;
        private final String port;
        // Holds the whole tail after the authority (path, query and fragment together).
        private final String path;
        private final String fragment;

        private ParsedUrl(String scheme, String host, String port, String path, String fragment) {
            this.scheme = scheme;
            this.host = host;
            this.port = port;
            this.path = path;
            this.fragment = fragment;
        }
    }

    /** Characters that are counted as a single output character by {@link #getEncodedURLSize}. */
    static BitSet dontNeedEncoding;
    static final int caseDiff = ('a' - 'A');
    /** Value of the "file.encoding" system property; not read anywhere in this class. */
    static String dfltEncName = null;

    static {
        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++) {
            dontNeedEncoding.set(i);
        }
        // A space is encoded as a single '+', so it also contributes exactly one character.
        dontNeedEncoding.set(' ');
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');

        // NOTE(review): GetPropertyAction is a JDK-internal class; confirm it is still
        // accessible on the JDK this module is built with.
        dfltEncName = AccessController.doPrivileged(
                new GetPropertyAction("file.encoding")
        );
    }

    /**
     * Computes how many characters {@code data} occupies after URL encoding, without building
     * the encoded string.
     *
     * <p>ASCII letters, digits, '-', '_', '.', '*' and the space count as one character each.
     * Every maximal run of other characters is converted to bytes in the given charset and
     * each byte counts as three characters ('%' plus two hex digits).</p>
     *
     * @param data     text to measure; must not be {@code null}
     * @param encoding name of the charset used to convert characters to bytes
     * @return length of the encoded form in characters
     * @throws NullPointerException         if {@code encoding} or {@code data} is {@code null}
     * @throws UnsupportedEncodingException if the charset name is illegal or not supported
     */
    public static int getEncodedURLSize(final String data, final String encoding) throws UnsupportedEncodingException {
        if (encoding == null) {
            throw new NullPointerException("charsetName");
        }

        final Charset charset;
        try {
            charset = Charset.forName(encoding);
        } catch (IllegalCharsetNameException e) {
            throw unsupportedEncoding(encoding, e);
        } catch (UnsupportedCharsetException e) {
            throw unsupportedEncoding(encoding, e);
        }

        final int length = data.length();
        int encodedSize = 0;
        int pos = 0;
        while (pos < length) {
            if (dontNeedEncoding.get(data.charAt(pos))) {
                encodedSize++;
                pos++;
                continue;
            }
            // Take the whole run of characters that need escaping and convert it at once.
            // Surrogates are never in the safe set, so a pair always stays inside one run.
            final int runStart = pos;
            do {
                pos++;
            } while (pos < length && !dontNeedEncoding.get(data.charAt(pos)));
            // each byte represented with '%' and two hex digits
            encodedSize += data.substring(runStart, pos).getBytes(charset).length * 3;
        }

        return encodedSize;
    }

    /** Builds the checked exception for a bad charset name while keeping the original cause. */
    private static UnsupportedEncodingException unsupportedEncoding(String encoding, IllegalArgumentException cause) {
        UnsupportedEncodingException exception = new UnsupportedEncodingException(encoding);
        exception.initCause(cause);
        return exception;
    }

    /** Tells whether a URL path/file component is absent, empty or just "/". */
    private static boolean isEmptyOrRoot(String part) {
        return part == null || part.isEmpty() || part.equals("/");
    }

    /**
     * Checks whether the URL points to a home page.
     * <p>
     * A home page is a URL that consists of the host name only, possibly followed by a
     * trailing '/'; it has neither a query nor a fragment.
     * </p>
     *
     * @param url   the URL to check
     * @return      whether the URL is a home page
     */
    public static boolean isHomePage(final URL url) {
        return isEmptyOrRoot(url.getPath()) &&
                isEmptyOrRoot(url.getFile()) &&
                url.getQuery() == null &&
                url.getRef() == null;
    }

    /**
     * Returns the authority of the URL, optionally prefixed with its protocol.
     *
     * <p>The protocol and the scheme delimiter are prepended when {@code showDefaultProtocol}
     * is {@code true} or when the protocol equals the HTTPS scheme.</p>
     *
     * @param url                 source URL
     * @param showDefaultProtocol whether to prepend the protocol even when it is not HTTPS
     * @return the authority, with the protocol prefix when required
     */
    public static String getHostName(URL url, boolean showDefaultProtocol) {
        StringBuilder builder = new StringBuilder();
        if (showDefaultProtocol || SupportedProtocols.HTTPS.getScheme().equals(url.getProtocol())) {
            builder.append(url.getProtocol());
            builder.append(SupportedProtocols.SCHEME_DELIMITER);
        }
        builder.append(url.getAuthority());
        return builder.toString();
    }

    /**
     * Returns the relative part of the URL: path, query and fragment.
     *
     * For example, for http://lenta.ru:8080/path/to/file?arg1=42&arg2=24#22 the result is
     * /path/to/file?arg1=42&arg2=24#22
     *
     * @param url   the URL object
     * @return      the relative URL string
     */
    public static String getRelativeUrl(URL url) {
        final String path = url.getPath();
        final String query = url.getQuery();
        final String ref = url.getRef();

        final StringBuilder result = new StringBuilder();
        if (path != null) {
            result.append(path);
        }
        if (query != null) {
            result.append('?').append(query);
        }
        if (ref != null) {
            result.append('#').append(ref);
        }
        return result.toString();
    }

    /**
     * Checks whether the host name matches the spammer-domain pattern: an optional "https://"
     * prefix, a part containing '-', then two more dot-separated parts ending in ".ru.com",
     * optionally followed by ":" and anything.
     *
     * @param hostName host name to test, may be {@code null}
     * @return {@code true} if the name matches the pattern; {@code false} otherwise, including
     *         for {@code null}
     */
    public static boolean isSpamerDomain(String hostName) {
        return hostName != null && SPAMER_HOST_PATTERN.matcher(hostName).matches();
    }

    /**
     * Drops the first dot-separated label of the host name.
     *
     * @param hostName host name, may be {@code null}
     * @return the remaining labels joined with '.', or {@code null} when {@code hostName} is
     *         {@code null} or splits into a single label
     */
    public static String getUpperLevelDomain(final String hostName) {
        if (hostName == null) {
            return null;
        }
        final String[] segments = hostName.split("\\.");
        if (segments.length == 1) {
            return null;
        }
        final StringBuilder builder = new StringBuilder(segments[1]);
        for (int i = 2; i < segments.length; i++) {
            builder.append('.').append(segments[i]);
        }
        return builder.toString();
    }
}
