package ru.yandex.webmaster3.core.semantic.semantic_document_parser.rta;

import org.htmlcleaner.*;
import ru.yandex.common.util.collections.Cf;
import ru.yandex.common.util.xml.Xmler;

import java.util.Collections;
import java.util.List;

import static ru.yandex.common.util.xml.Xmler.tag;

/**
 * Detects the RTA rating label in an HTML document.
 *
 * <p>The document is parsed with a shared {@link HtmlCleaner} instance, and the label is reported
 * when one of the child tags of {@code <head>} is a {@code meta} element whose {@code name}
 * attribute is {@code rating} and whose {@code content} attribute is
 * {@code RTA-5042-1996-1400-1577-RTA} (both compared case-insensitively).
 *
 * @author rasifiel
 * @since 3/21/12
 */
public class RTAExtractor {

    /** Label value matched against the meta tag's {@code content} and echoed in the result. */
    private static final String RTA_LABEL = "RTA-5042-1996-1400-1577-RTA";

    /**
     * Tag provider that marks {@code meta} and {@code link} as belonging to both head and body;
     * every other tag keeps the descriptor supplied by {@link DefaultTagProvider}.
     */
    static class HTML5TagProvider extends DefaultTagProvider {
        @Override
        public TagInfo getTagInfo(final String tagName) {
            final TagInfo info = super.getTagInfo(tagName);
            // Null check is defensive: never dereference a descriptor the parent did not supply.
            if (info != null && ("meta".equals(tagName) || "link".equals(tagName))) {
                info.setBelongsTo(BelongsTo.HEAD_AND_BODY);
            }
            return info;
        }
    }

    /** Shared cleaner; its properties are set once in the static initializer below. */
    static HtmlCleaner cleaner = new HtmlCleaner(new HTML5TagProvider());

    static {
        final CleanerProperties props = cleaner.getProperties();
        props.setOmitXmlDeclaration(true);
        props.setNamespacesAware(false);
        // Script and style elements are pruned from the cleaned tree.
        props.setPruneTags("script,style");
    }

    /**
     * Looks for the RTA rating meta tag among the child tags of the document's {@code head}.
     *
     * @param content   HTML text to inspect; {@code null} is treated as a document without the label
     * @param sourceUrl URL the content came from; currently not used by this method
     * @return a list holding one {@code rta/rating/v} tag with the label value when the meta tag
     *         is present, otherwise an empty list
     */
    public static List<Xmler.Tag> extractRTA(final String content, final String sourceUrl) {
        if (content == null) {
            // Nothing to parse: report "no label" rather than failing inside the HTML parser.
            return Collections.emptyList();
        }
        final TagNode root = cleaner.clean(content);
        final TagNode head = root.findElementByName("head", true);
        if (head == null) {
            return Collections.emptyList();
        }
        for (final TagNode node : head.getChildTags()) {
            if (isRtaRatingMeta(node)) {
                return Cf.list(tag("rta", tag("rating", tag("v", RTA_LABEL))));
            }
        }
        return Collections.emptyList();
    }

    /** Tells whether {@code node} is a {@code meta} tag named {@code rating} carrying the RTA label. */
    private static boolean isRtaRatingMeta(final TagNode node) {
        // The constant is the receiver of equalsIgnoreCase, so a missing attribute (null) is safe.
        return "meta".equals(node.getName())
                && "rating".equalsIgnoreCase(node.getAttributeByName("name"))
                && RTA_LABEL.equalsIgnoreCase(node.getAttributeByName("content"));
    }
}
