package ru.yandex.webmaster3.core.robotstxt;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;

/**
 * Created by Oleg Bazdyrev on 27/07/2018.
 */
public class RobotsTxtUtils {

    private static final String RULE_USER_AGENT = "user-agent:";
    private static final String ROBOT_ALL = "*";
    private static final String ROBOT_YANDEX = "yandex";
    private static final String RULE_ALLOW = "allow:";
    private static final String RULE_DISALLOW = "disallow:";
    private static final char COMMENT_START = '#';

    /**
     * Finds the {@code Allow}/{@code Disallow} rule of a robots.txt that decides whether the
     * given path prefix is allowed.
     *
     * <p>Only sections whose {@code User-agent} value is exactly {@code *} or {@code yandex}
     * (case-insensitive) are considered. Every time such a section starts, the rule found so
     * far is discarded, so a {@code yandex} section that follows a {@code *} section replaces
     * it. Scanning stops at the first {@code User-agent} line that follows a {@code yandex}
     * section.
     *
     * <p>Within a section the rule with the longest pattern (wildcard characters {@code *}
     * and {@code $} are not counted) wins; on equal length an {@code Allow} rule wins.
     * Matching is a prefix heuristic, not full wildcard matching:
     * <ul>
     *   <li>a {@code Disallow} pattern matches when {@code prefix} starts with the pattern
     *       with all {@code *} and {@code $} removed;</li>
     *   <li>an {@code Allow} pattern matches when {@code prefix} starts with the part of the
     *       pattern before its first {@code *} (with {@code $} removed).</li>
     * </ul>
     *
     * <p>Everything from a {@code #} to the end of a line is treated as a comment and ignored.
     *
     * @param prefix    path prefix to check; must not be {@code null}
     * @param robotsTxt full text of the robots.txt file; must not be {@code null}
     * @return an {@link AllowPrefixInfo} built from the allow flag, the text of the winning
     *         rule and its 1-based line number; when no rule matches these are
     *         {@code true}, {@code ""} and {@code -1}
     * @throws IOException if reading the text or closing the reader fails
     */
    public static AllowPrefixInfo isPrefixAllowed(String prefix, String robotsTxt) throws IOException {
        // Best rule found so far in the current relevant section; the default is "allowed".
        String rule = "";
        int ruleLength = 0;
        boolean ruleAllow = true;
        int ruleLineNumber = -1;

        try (BufferedReader reader = new BufferedReader(new StringReader(robotsTxt))) {
            String robot = null;
            int lineNumber = 0;
            String rawLine;
            while ((rawLine = reader.readLine()) != null) {
                lineNumber++;
                String line = stripComment(rawLine).strip();
                // Locale.ROOT keeps directive matching independent of the JVM default locale
                // (e.g. in a Turkish locale "DISALLOW:" would otherwise lower to "dısallow:").
                String lowered = line.toLowerCase(Locale.ROOT);

                if (lowered.startsWith(RULE_USER_AGENT)) {
                    if (ROBOT_YANDEX.equals(robot)) {
                        break; // the yandex section is over, nothing better can follow
                    }
                    robot = line.substring(RULE_USER_AGENT.length()).trim().toLowerCase(Locale.ROOT);
                    if (ROBOT_ALL.equals(robot) || ROBOT_YANDEX.equals(robot)) {
                        // A new relevant section starts: forget the rule found in the previous one.
                        rule = "";
                        ruleAllow = true;
                        ruleLineNumber = -1;
                        ruleLength = 0;
                    } else {
                        robot = null; // sections for other robots are skipped
                    }
                    continue;
                }
                if (robot == null) {
                    continue; // lines outside a relevant section are ignored
                }

                boolean currentAllow;
                String currentPrefix;
                if (lowered.startsWith(RULE_ALLOW)) {
                    currentAllow = true;
                    currentPrefix = line.substring(RULE_ALLOW.length()).trim();
                } else if (lowered.startsWith(RULE_DISALLOW)) {
                    currentAllow = false;
                    currentPrefix = line.substring(RULE_DISALLOW.length()).trim();
                } else {
                    continue;
                }

                // For Allow the strictest reading is used: only the literal part before the
                // first '*' has to be a prefix of the checked path.
                String currentRule = currentPrefix;
                int starIndex = currentPrefix.indexOf('*');
                if (currentAllow && starIndex >= 0) {
                    currentRule = currentPrefix.substring(0, starIndex);
                }

                if (prefix.startsWith(stripWildcards(currentRule))) {
                    int prefixLength = stripWildcards(currentPrefix).length();
                    // A longer rule wins; on equal length Allow wins.
                    if (prefixLength > ruleLength || (prefixLength == ruleLength && currentAllow)) {
                        ruleAllow = currentAllow;
                        rule = currentPrefix;
                        ruleLineNumber = lineNumber;
                        ruleLength = prefixLength;
                    }
                }
            }
        }

        return new AllowPrefixInfo(ruleAllow, rule, ruleLineNumber);
    }

    /** Returns the line without its trailing {@code #} comment, if it has one. */
    private static String stripComment(String line) {
        int commentStart = line.indexOf(COMMENT_START);
        return commentStart >= 0 ? line.substring(0, commentStart) : line;
    }

    /** Removes the robots.txt wildcard characters {@code *} and {@code $} from a pattern. */
    private static String stripWildcards(String pattern) {
        return pattern.replace("*", "").replace("$", "");
    }

}
