package ru.yandex.webmaster3.core.semantic.schema_org_information_extractor;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyHtmlSerializer;
import org.htmlcleaner.TagNode;

import java.util.LinkedList;
import java.util.List;

/**
 * Extracts {@link RDFsEntity} descriptions from an HTML document annotated with
 * RDFa-style attributes ({@code typeof}, {@code about}, {@code resource},
 * {@code rel}, {@code property}, {@code content}).
 *
 * <p>The document is cleaned with {@link HtmlCleaner} and the resulting tag tree is
 * walked recursively: every tag carrying a non-empty {@code typeof} together with a
 * non-empty {@code resource}/{@code about} starts a new entity, and the tags below it
 * contribute that entity's properties.
 *
 * @author rasifiel
 * @since 3/26/12
 */
public class Parser {

    private static final String ATTR_TYPEOF = "typeof";
    private static final String ATTR_ABOUT = "about";
    private static final String ATTR_RESOURCE = "resource";
    private static final String ATTR_REL = "rel";
    private static final String ATTR_PROPERTY = "property";
    private static final String ATTR_CONTENT = "content";
    private static final String ATTR_HREF = "href";
    private static final String ATTR_SRC = "src";

    private final HtmlCleaner cleaner;
    private final String content;

    /**
     * Creates a parser for the given markup.
     *
     * @param content HTML text to analyse; parsing is deferred until
     *                {@link #getEntities()} is called
     */
    public Parser(final String content) {
        this.content = content;
        final CleanerProperties properties = new CleanerProperties();
        properties.setNamespacesAware(false);
        cleaner = new HtmlCleaner(properties);
    }

    /**
     * Parses the content and collects every entity found in it.
     *
     * @return the entities in document order; an empty list when the content is
     *         {@code null} or contains no entity declarations
     */
    public List<RDFsEntity> getEntities() {
        final List<RDFsEntity> result = new LinkedList<RDFsEntity>();
        if (content == null) {
            // Nothing to parse; handing null to the cleaner would fail with an NPE.
            return result;
        }
        final TagNode root = cleaner.clean(content);
        extract(root, result);
        return result;
    }

    /**
     * Recursively scans {@code root} and its descendants for entity declarations
     * and appends the entities found to {@code result}.
     *
     * @param root   tag to inspect
     * @param result accumulator for discovered entities
     */
    private void extract(final TagNode root, final List<RDFsEntity> result) {
        final String typeOf = root.getAttributeByName(ATTR_TYPEOF);
        // "resource" takes precedence; "about" is only consulted when "resource" is
        // absent and the tag declares a type.
        String about = root.getAttributeByName(ATTR_RESOURCE);
        if (about == null && typeOf != null) {
            about = root.getAttributeByName(ATTR_ABOUT);
        }

        final boolean declaresEntity = typeOf != null && !typeOf.isEmpty()
                && about != null && !about.isEmpty();
        if (declaresEntity) {
            result.add(getEntityFrom(root, typeOf, about));
        }

        for (final TagNode child : root.getChildTags()) {
            extract(child, result);
            if (declaresEntity) {
                addEquivalentEntities(child, result);
            }
        }
    }

    /**
     * If {@code node} is an {@code owl:equivalent*} relation, turns each of its
     * child tags into an entity and appends it to {@code result}.
     *
     * <p>NOTE(review): a child tag that itself carries {@code typeof} and
     * {@code about}/{@code resource} has already been added by the recursive
     * {@link #extract} call, so it ends up in {@code result} twice; child tags lacking
     * those attributes produce an entity built from {@code null} values. Both
     * behaviours are inherited from the original implementation - confirm whether
     * callers rely on them before changing.
     */
    private void addEquivalentEntities(final TagNode node, final List<RDFsEntity> result) {
        final String rel = node.getAttributeByName(ATTR_REL);
        if (rel == null || !rel.startsWith("owl:equivalent")) {
            return;
        }
        for (final TagNode equivalent : node.getChildTags()) {
            result.add(getEntityFrom(equivalent,
                    equivalent.getAttributeByName(ATTR_TYPEOF),
                    equivalent.getAttributeByName(ATTR_ABOUT)));
        }
    }

    /**
     * Builds an entity of the given type and identifier and fills it with the
     * properties found under {@code root}.
     */
    private RDFsEntity getEntityFrom(final TagNode root, final String typeOf, final String about) {
        final RDFsEntity entity = new RDFsEntity(typeOf, about);
        traverseAndExtract(root, entity);
        return entity;
    }

    /**
     * Collects the properties contributed by {@code root} (and, where applicable,
     * its descendants) into {@code entity}.
     *
     * <p>Three sources are handled, in this order: {@code rdfs:*} relations,
     * {@code owl:equivalentProperty} relations, and the {@code property} attribute.
     * Descendants are visited only when {@code root} has no non-empty
     * {@code property}; descendants that declare their own {@code typeof} are skipped.
     *
     * @param root   tag to read properties from
     * @param entity entity receiving the properties
     */
    private void traverseAndExtract(final TagNode root, final RDFsEntity entity) {
        final String rel = root.getAttributeByName(ATTR_REL);

        if (rel != null && rel.startsWith("rdfs")) {
            extractRdfsRelation(root, rel, entity);
        }

        if ("owl:equivalentProperty".equals(rel)) {
            // The equivalent property's description sits two levels down; merge
            // whatever it declares into the current entity.
            for (final TagNode child : root.getChildTags()) {
                for (final TagNode subChild : child.getChildTags()) {
                    traverseAndExtract(subChild, entity);
                }
            }
        }

        final String property = root.getAttributeByName(ATTR_PROPERTY);
        if (property != null && !property.isEmpty()) {
            if (root.hasAttribute(ATTR_CONTENT)) {
                entity.addProperty(property, root.getAttributeByName(ATTR_CONTENT));
            } else {
                entity.addProperty(property, extractTextOrHref(root));
            }
        } else {
            for (final TagNode child : root.getChildTags()) {
                if (!child.hasAttribute(ATTR_TYPEOF)) {
                    traverseAndExtract(child, entity);
                }
            }
        }
    }

    /**
     * Records the target(s) of an {@code rdfs:*} relation declared on {@code node}.
     *
     * @param node   tag carrying the relation
     * @param rel    value of the tag's {@code rel} attribute, starting with {@code rdfs}
     * @param entity entity receiving the property
     */
    private void extractRdfsRelation(final TagNode node, final String rel, final RDFsEntity entity) {
        if (rel.equals("rdfs:range") || rel.equals("rdfs:domain")) {
            addReferencedResources(node, rel, entity, true);
        } else if (rel.startsWith("rdfs:sub")) {
            addReferencedResources(node, rel, entity, false);
        } else if (node.hasAttribute(ATTR_RESOURCE)) {
            entity.addProperty(rel, node.getAttributeByName(ATTR_RESOURCE));
        } else {
            // NOTE(review): "href" may be absent here, in which case a null value
            // is passed on - inherited behaviour.
            entity.addProperty(rel, node.getAttributeByName(ATTR_HREF));
        }
    }

    /**
     * Adds the resource(s) a relation points to: the tag's own {@code resource}
     * attribute when present, otherwise the {@code about} of each child tag and,
     * optionally, the {@code owl:unionOf} members nested under typed children.
     *
     * @param node          tag carrying the relation
     * @param rel           property name to record the targets under
     * @param entity        entity receiving the property
     * @param includeUnions whether {@code owl:unionOf} members of typed children
     *                      are collected as well
     */
    private void addReferencedResources(final TagNode node, final String rel,
                                        final RDFsEntity entity, final boolean includeUnions) {
        if (node.hasAttribute(ATTR_RESOURCE)) {
            entity.addProperty(rel, node.getAttributeByName(ATTR_RESOURCE));
            return;
        }
        for (final TagNode child : node.getChildTags()) {
            if (child.hasAttribute(ATTR_ABOUT)) {
                entity.addProperty(rel, child.getAttributeByName(ATTR_ABOUT));
            }
            if (includeUnions && child.hasAttribute(ATTR_TYPEOF)) {
                for (final TagNode subChild : child.getChildTags()) {
                    if ("owl:unionOf".equals(subChild.getAttributeByName(ATTR_REL))) {
                        // NOTE(review): "resource" may be absent on the union tag,
                        // yielding a null value - inherited behaviour.
                        entity.addProperty(rel, subChild.getAttributeByName(ATTR_RESOURCE));
                    }
                }
            }
        }
    }

    /**
     * Returns the tag's non-empty {@code href}, else its non-empty {@code src},
     * else its text content.
     */
    private String extractTextOrHref(final TagNode root) {
        final String href = root.getAttributeByName(ATTR_HREF);
        if (href != null && !href.isEmpty()) {
            return href;
        }
        final String src = root.getAttributeByName(ATTR_SRC);
        if (src != null && !src.isEmpty()) {
            return src;
        }
        return extractText(root);
    }

    /** Returns the text content of the tag as reported by {@link TagNode#getText()}. */
    private String extractText(final TagNode root) {
        return root.getText().toString();
    }

}
