package ru.yandex.wmconsole.servantlet.robotstxt;

import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

import ru.yandex.common.util.xml.XmlConvertable;
import ru.yandex.wmconsole.data.info.AcceptedLineInfo;
import ru.yandex.wmconsole.data.info.AllowInfo;
import ru.yandex.wmconsole.data.info.ErrorInfo;
import ru.yandex.wmconsole.data.info.ParseErrorInfo;
import ru.yandex.wmconsole.data.info.UrlAllowInfo;
import ru.yandex.wmconsole.data.wrappers.AcceptedLineInfoWrapper;
import ru.yandex.wmconsole.data.wrappers.ParseErrorInfoWrapper;
import ru.yandex.wmconsole.data.wrappers.UrlAllowInfoWrapper;
import ru.yandex.wmconsole.util.WwwUtil;
import ru.yandex.wmtools.common.Constants;
import ru.yandex.wmtools.common.error.UserException;
import ru.yandex.wmtools.common.servantlet.AbstractServantlet;
import ru.yandex.wmtools.common.util.URLUtil;
import ru.yandex.wmtools.common.util.XmlConvertableCollectionWrapper;

/**
 * Common utilities for different robots.txt analyzers.
 *
 * <p>All members are static; the class keeps no state of its own. Several methods
 * communicate through caller-supplied collections that they modify in place
 * (see {@link #getValidUrls} and {@link #getAllowData}).
 *
 * @author ailyin
 */
public class RobotsTxtHelper implements Constants {
    /** Name of the request parameter {@code "robotstxt"}. */
    public static final String PARAM_ROBOTSTXT = "robotstxt";
    /** Name of the request parameter {@code "urls"}. */
    public static final String PARAM_URLS = "urls";
    /** Name of the request parameter {@code "only_load"}. */
    public static final String PARAM_ONLY_LOAD = "only_load";

    /** Limit on the number of URLs; not enforced in this class (presumably checked by callers). */
    public static final int MAX_URLS_COUNT = 100;
    /** URLs longer than this many characters are rejected by {@link #getValidUrls}. */
    private static final int MAX_URL_LENGTH = 1024;
    /** Limit on the robots.txt size; not enforced in this class (unit presumably bytes or chars - confirm with callers). */
    public static final int MAX_ROBOTS_TXT_SIZE = 1024 * 32;

    /**
     * Tells whether the given URL is written relative to the host root.
     *
     * @param url URL string as entered by the user; must not be {@code null}
     * @return {@code true} if the string starts with {@code '/'}
     */
    private static boolean isRelativeUrl(String url) {
        return url.startsWith("/");
    }

    /**
     * Creates a URL object from a string and encodes Cyrillic URLs to punycode.
     * This method differs from AbstractServantlet.prepareUrl in that it does not force
     * URL encoding when the URL is not valid according to our data. See WMCON-4978.
     *
     * <p>Relative URLs are prefixed with the protocol and authority of {@code hostName};
     * URLs without a scheme delimiter are prefixed with the protocol of {@code hostName} only.
     *
     * @param hostName host whose protocol (and, for relative URLs, authority) completes the URL
     * @param url      URL string as entered by the user
     * @return the parsed URL
     * @throws UserException if {@code AbstractServantlet.doPrepareUrl} rejects the completed string
     */
    private static URL urlFromString(URL hostName, String url) throws UserException {
        String absolute = url;
        if (isRelativeUrl(absolute)) {
            absolute = hostName.getProtocol() + SCHEME_DELIMITER + hostName.getAuthority() + absolute;
        } else if (!absolute.contains(SCHEME_DELIMITER)) {
            absolute = hostName.getProtocol() + SCHEME_DELIMITER + absolute;
        }

        return AbstractServantlet.doPrepareUrl(absolute, true, false, false);
    }

    /**
     * Validates the given URLs and returns the paths of the valid ones, in input order.
     *
     * <p>For every invalid URL the slot with the same index in {@code allowInfos} is set to a
     * {@link UrlAllowInfo} carrying an error code ({@code ROBOTSTXT_URL_TOO_LONG},
     * {@code NO_HOSTNAME}, {@code URL_SYNTAX_ERROR} or {@code WRONG_DOMAIN}); slots of valid URLs
     * are left untouched. Because {@code List.set} is used, {@code allowInfos} must already
     * contain at least {@code urls.length} elements.
     *
     * <p>Side effect: when an absolute URL matches the host only up to the {@code www} prefix,
     * the corresponding entry of {@code urls} is replaced with the result of
     * {@code WwwUtil.switchWWW}.
     *
     * @param hostname   host the robots.txt belongs to, or {@code null} if unknown; with a
     *                   {@code null} host only relative URLs are accepted and returned as is
     * @param urls       URLs to check; entries must not be {@code null}; may be modified in place
     * @param allowInfos per-URL result slots, filled in for invalid URLs only
     * @return paths (as returned by {@link URL#getFile()}, or {@code "/"} when that is empty)
     *         of the valid URLs
     */
    public static List<String> getValidUrls(URL hostname, String[] urls, List<UrlAllowInfo> allowInfos) {
        List<String> result = new ArrayList<String>(urls.length);

        // add corresponding allowInfos for invalid urls and add valid urls to result list
        for (int i = 0; i < urls.length; i++) {
            String original = urls[i];
            if (original.length() > MAX_URL_LENGTH) {
                allowInfos.set(i, new UrlAllowInfo(original, "ROBOTSTXT_URL_TOO_LONG"));
                continue;
            }

            if (hostname == null) {
                // Without a host an absolute URL cannot be matched against anything.
                if (!isRelativeUrl(original)) {
                    allowInfos.set(i, new UrlAllowInfo(original, "NO_HOSTNAME"));
                    continue;
                }
                result.add(original);
                continue;
            }

            URL url;
            try {
                url = urlFromString(hostname, original);
            } catch (UserException e) {
                // Deliberately not propagated: the failure is reported per URL.
                allowInfos.set(i, new UrlAllowInfo(original, "URL_SYNTAX_ERROR"));
                continue;
            }

            String fullHostName = URLUtil.getHostName(hostname, true);
            String fullUrlHostName = URLUtil.getHostName(url, true);
            boolean equalsIgnoreWww = WwwUtil.equalsIgnoreWww(fullHostName, fullUrlHostName);
            boolean equalsExact = fullHostName.equalsIgnoreCase(fullUrlHostName);
            if (!equalsIgnoreWww) {
                allowInfos.set(i, new UrlAllowInfo(original, "WRONG_DOMAIN"));
                continue;
            }
            if (!isRelativeUrl(original) && !equalsExact) {
                // Hosts differ only by the www prefix: rewrite the caller's entry in place.
                urls[i] = WwwUtil.switchWWW(original);
            }

            /*
             * robots.txt parser assumes '/' always follows hostname.
             * So we add '/' at the end when url path is empty.
             */
            String file = url.getFile();
            result.add(StringUtils.isEmpty(file) ? "/" : file);
        }

        return result;
    }

    /**
     * Splits robots.txt content into lines.
     *
     * <p>All line terminator styles ({@code '\r'}, {@code '\n'}, {@code "\r\n"}) are treated as
     * line breaks. Note that {@link String#split(String)} removes trailing empty strings, so
     * empty lines at the end of the content are not present in the returned array.
     *
     * @param str robots.txt content; must not be {@code null}
     * @return the lines without their terminators
     */
    public static String[] splitRobotsTxt(String str) {
        /*
         * Replaces different types of line terminators ('\r', '\n', "\r\n") with
         * unix-style line terminator ('\n').
         */
        String unixStyle = str.replace("\r\n", "\n").replace("\r", "\n");
        return unixStyle.split("\n");
    }

    /**
     * Returns the text of the line with the given 1-based number.
     *
     * <p>The line array may be shorter than the line count seen by the parser (trailing empty
     * lines are dropped by {@link #splitRobotsTxt}), and a line number may be 0 or negative;
     * in those cases an empty string is returned instead of failing with
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param robotsTxtLines lines of robots.txt
     * @param lineNumber     1-based line number
     * @return the line text, or {@code ""} if there is no such line
     */
    private static String lineAt(String[] robotsTxtLines, int lineNumber) {
        if (lineNumber < 1 || lineNumber > robotsTxtLines.length) {
            return "";
        }
        return robotsTxtLines[lineNumber - 1];
    }

    /**
     * Builds the {@code "parse-errors"} XML data: every error paired with the text of its line.
     *
     * @param errorInfos     parse errors; their line numbers are 1-based
     * @param robotsTxtLines lines of the analyzed robots.txt
     * @return wrapper around the resulting {@link ParseErrorInfo} list
     */
    public static XmlConvertable getErrorsData(List<ErrorInfo> errorInfos, String[] robotsTxtLines) {
        List<ParseErrorInfo> parseErrorInfos = new ArrayList<ParseErrorInfo>(errorInfos.size());
        for (ErrorInfo errorInfo : errorInfos) {
            String lineText = lineAt(robotsTxtLines, (int) errorInfo.getLineNumber());
            parseErrorInfos.add(new ParseErrorInfo(lineText, errorInfo));
        }
        return XmlConvertableCollectionWrapper.wrap(parseErrorInfos, ParseErrorInfoWrapper.class, "parse-errors");
    }

    /**
     * Builds the {@code "accepted-lines"} XML data, grouping the accepted lines into sections.
     *
     * <p>A new {@code "section"} starts at the first line and whenever a line number is not
     * exactly one greater than the previous one, i.e. each section is a run of consecutive lines.
     *
     * @param acceptedLines  1-based numbers of the accepted lines, in the order to be reported
     * @param robotsTxtLines lines of the analyzed robots.txt
     * @return wrapper around the list of sections
     */
    public static XmlConvertable getAcceptedLinesData(List<Long> acceptedLines, String[] robotsTxtLines) {
        List<XmlConvertable> sections = new ArrayList<XmlConvertable>();

        List<AcceptedLineInfoWrapper> lines = null;
        long previous = 0;
        for (Long accepted : acceptedLines) {
            long current = accepted.longValue();
            // lines == null only before the first element, which always opens a section
            if (lines == null || current != previous + 1) {//new section
                addSectionIfNeeded(lines, sections);
                lines = new ArrayList<AcceptedLineInfoWrapper>();
            }
            int lineNumber = accepted.intValue();
            lines.add(new AcceptedLineInfoWrapper(
                    new AcceptedLineInfo(lineNumber, lineAt(robotsTxtLines, lineNumber))));
            previous = current;
        }
        addSectionIfNeeded(lines, sections);

        return new XmlConvertableCollectionWrapper(sections, "accepted-lines");
    }

    /**
     * Appends the collected lines to {@code sections} as a {@code "section"} element.
     *
     * @param lines    lines of the section being closed, or {@code null} if no section was started
     * @param sections list of sections to append to
     */
    private static void addSectionIfNeeded(List<AcceptedLineInfoWrapper> lines, List<XmlConvertable> sections) {
        if (lines != null) {// previous section exists
            sections.add(new XmlConvertableCollectionWrapper(lines, "section"));
        }
    }

    /**
     * Merges the analysis results into {@code urlAllowInfos} and builds the
     * {@code "are-allowed"} XML data.
     *
     * <p>Slots of {@code urlAllowInfos} that are already non-null correspond to invalid URLs and
     * are skipped; the entries of {@code allowInfos} are written, in order, into the remaining
     * {@code null} slots. {@code allowInfos} must not contain more entries than there are
     * {@code null} slots, otherwise the list access fails with {@link IndexOutOfBoundsException}.
     *
     * @param allowInfos    results for the valid URLs, in the order of the valid URLs
     * @param urlAllowInfos per-URL result slots; modified in place
     * @param urls          the URLs, index-aligned with {@code urlAllowInfos}
     * @return wrapper around {@code urlAllowInfos}
     */
    public static XmlConvertable getAllowData(List<AllowInfo> allowInfos, List<UrlAllowInfo> urlAllowInfos, String[] urls) {
        int slot = 0;
        for (AllowInfo info : allowInfos) {
            while (urlAllowInfos.get(slot) != null) {
                // if urlAllowInfos.get(slot) != null it corresponds to an invalid url
                slot++;
            }

            UrlAllowInfo merged = info.isSyntaxError()
                    ? new UrlAllowInfo(urls[slot], "URL_SYNTAX_ERROR")
                    : new UrlAllowInfo(urls[slot], info);
            urlAllowInfos.set(slot, merged);
            slot++;
        }
        return XmlConvertableCollectionWrapper.wrap(urlAllowInfos, UrlAllowInfoWrapper.class, "are-allowed");
    }

    /**
     * Returns a copy of {@code errorInfos} with {@code errorInfo} inserted before the first
     * element whose line number is greater than its own, or appended if there is none.
     * The input list is not modified.
     *
     * @param errorInfos existing errors (expected to be ordered by line number for the result
     *                   to stay ordered)
     * @param errorInfo  error to insert
     * @return a new list containing all elements of {@code errorInfos} plus {@code errorInfo}
     */
    public static List<ErrorInfo> addToRightPosition(List<ErrorInfo> errorInfos, ErrorInfo errorInfo) {
        List<ErrorInfo> res = new ArrayList<ErrorInfo>(errorInfos.size() + 1);
        boolean isInserted = false;
        for (ErrorInfo cur : errorInfos) {
            if (!isInserted && cur.getLineNumber() > errorInfo.getLineNumber()) {
                res.add(errorInfo);
                isInserted = true;
            }
            res.add(cur);
        }
        if (!isInserted) {
            res.add(errorInfo);
        }
        return res;
    }

    /**
     * Counts how many times a character occurs in a string.
     *
     * @param s string to scan; must not be {@code null}
     * @param c character to count
     * @return number of positions in {@code s} holding {@code c}
     */
    public static int countOccurences(String s, char c) {
        int res = 0;
        // charAt avoids copying the whole string into a temporary char array
        for (int i = 0, length = s.length(); i < length; i++) {
            if (s.charAt(i) == c) {
                res++;
            }
        }
        return res;
    }
}
