#include "urls.h"
#include <algorithm>

//parsed_params.push_back("product_id");
//parsed_params.push_back("offerid");
//parsed_params.push_back("oid");
//parsed_params.push_back("size");
//parsed_params.push_back("offer");
//parsed_params.push_back("offerid");


namespace {

// Removes one leading "m." or "www." label from `url`.
// "m." is tested first; at most one prefix is removed per call.
// Any other input is returned unchanged.
TString CutMobileAndWww(const TString& url) {
	if (url.StartsWith("m.")) {
		return url.substr(2);
	}
	if (url.StartsWith("www.")) {
		return url.substr(4);
	}
	return url;
}

// Drops a leading "http://" or "https://" from `url`.
// Other schemes (and scheme-less input) are returned unchanged.
TString CutScheme(const TString& url) {
	size_t schemeLength = 0;
	if (url.StartsWith("http://")) {
		schemeLength = 7;
	} else if (url.StartsWith("https://")) {
		schemeLength = 8;
	}
	// substr(0) yields the whole string when no scheme matched.
	return url.substr(schemeLength);
}

// Strips the http(s) scheme and then up to two leading "m." / "www." labels.
TString CutSchemeWwwMobile(const TString& url) {
	const TString schemeless = CutScheme(url);
	// The prefix cutter runs twice so that stacked prefixes
	// ("www.m." and "m.www.") are both removed.
	const TString firstPass = CutMobileAndWww(schemeless);
	return CutMobileAndWww(firstPass);
}

// Decodes the percent-escapes of the URL structural characters '/', '?', '='
// and '&' (%2f, %3f, %3d, %26) and, for non-ozon URLs, repairs a query string
// that starts with '&' instead of '?'.
//
// Only lower-case hex escapes are recognised; CanonizeUrl lower-cases the URL
// before calling this function.
//
// The original implementation mapped %3d to '&' and %26 to '='; by the ASCII
// table 0x3D is '=' and 0x26 is '&', so the two replacements were swapped.
TString DecodeQuestionAndEq(const TString& url) {
	TString decoded = url;

	// Replaces every occurrence of a three-character escape with one character.
	// Searching resumes right after the inserted character: none of the
	// inserted characters can start or complete another escape.
	const auto replaceAll = [&decoded](const char* escape, const char* plain) {
		for (auto pos = decoded.find(escape); pos != TString::npos; pos = decoded.find(escape, pos + 1)) {
			decoded.replace(pos, 3, plain);
		}
	};
	replaceAll("%2f", "/");
	replaceAll("%3f", "?");
	replaceAll("%3d", "=");
	replaceAll("%26", "&");

	if (decoded.find("ozon.ru") == TString::npos) {
		const auto saPos = decoded.find("&sa");
		const auto slashAmpPos = decoded.find("/&");
		if (decoded.find("?") == TString::npos && saPos != TString::npos && saPos > 0) {
			// No '?' at all but a "&sa..." parameter: make it the query start.
			decoded.replace(saPos, 3, "?sa");
		} else if (slashAmpPos != TString::npos && slashAmpPos > 0) {
			// Path directly followed by '&': treat the '&' as the query start.
			decoded.replace(slashAmpPos, 2, "/?");
		}
	}
	return decoded;
}

// Returns `url` without a single trailing '/'.
// An empty string is returned as is: calling back() on it would read past the
// string, and CanonizeUrl can pass an empty value here (e.g. for "" or
// "http://").
TString CutLastSlash(const TString& url) {
	if (!url.empty() && url.back() == '/') {
		return url.substr(0, url.size() - 1);
	}
	return url;
}

// Returns the host part of a URL whose scheme has already been removed:
// everything before the first '/', or the whole string when there is no '/'.
TString GetDomain(const TString& url) {
	const auto hostEnd = url.find("/");
	return url.substr(0, hostEnd);
}

// Returns `url` truncated at the first '?', i.e. without its query string.
// A URL without '?' is returned whole.
TString CutCGIParams(const TString& url) {
	const auto queryStart = url.find("?");
	return url.substr(0, queryStart);
}

// Rewrites an ozon product URL to "ozon.ru/context/detail/id/<digits>".
//
// `url` is expected without a scheme. Only paths starting with
// "/context/detail/id/" or "/product/" are rewritten; everything else
// (including a URL with no path) is returned unchanged.
//
// The id is the run of digits that ends at a chosen right bound:
//   1. the bound starts at the '?' if the path has one, otherwise at the first
//      '/' after the prefix, otherwise at the end of the path;
//   2. scanning leftwards from there, the first '/' that directly follows a
//      digit replaces the bound.
// A digit run shorter than four characters is discarded, which yields
// "ozon.ru/context/detail/id/" with an empty id.
TString CanonizeOzonUrl(const TString& url) {
	const auto slashPos = url.find("/");
	if (slashPos == TString::npos) {
		return url;
	}
	const TString tail = url.substr(slashPos);

	const TVector<TString> productPrefixes {"/context/detail/id/", "/product/"};
	for (const auto& productPrefix : productPrefixes) {
		if (tail.substr(0, productPrefix.size()) == productPrefix) {
			// Step 1: initial right bound of the id.
			size_t idEnd = tail.size();
			const auto queryPos = tail.find('?');
			if (queryPos != TString::npos) {
				idEnd = queryPos;
			} else {
				const auto nextSlash = tail.find('/', productPrefix.size());
				if (nextSlash != TString::npos) {
					idEnd = nextSlash;
				}
			}

			// Step 2: prefer a '/' that closes a digit run, if any lies to the left.
			unsigned probe = idEnd;
			while (probe > 0) {
				if (tail[probe] == '/' && IsDigit(tail[probe - 1])) {
					idEnd = probe;
					break;
				}
				--probe;
			}

			// Walk left over the digits; tail[0] is '/', so this stops at index >= 1.
			size_t idBegin = idEnd;
			while (IsDigit(tail[idBegin - 1])) {
				--idBegin;
			}

			const size_t idLength = idEnd - idBegin;
			const TString id = (idLength < 4) ? TString() : tail.substr(idBegin, idLength);
			return TStringBuilder() << "ozon.ru/context/detail/id/" << id;
		}
	}
	return url;
}

// Extracts the URL embedded in an ozon.onelink.me link (the text after the
// last "//") and canonizes it with CanonizeOzonUrl.
//
// On success `domain` is set to "ozon.ru". When `url` contains no "//" there
// is nothing to extract: `url` is returned unchanged and `domain` is left
// untouched. (Previously rfind() == npos wrapped around in "npos + 2" and the
// first character of the URL was cut off.)
TString CanonizeOzonOneLinkUrl(const TString& url, TString& domain){
	const auto separatorPos = url.rfind("//");
	if (separatorPos == TString::npos) {
		return url;
	}
	domain = "ozon.ru";
	return CanonizeOzonUrl(url.substr(separatorPos + 2));
}

// Reduces a wildberries product URL to "<...>catalog/<digits>/detail.aspx".
//
// A duplicated ".../detail.aspx/detail.aspx" ending is collapsed first.
// The URL is returned without further changes when it has no "catalog/"
// part, or when "catalog/" is not directly followed by at least one digit
// (previously such URLs were turned into ".../catalog//detail.aspx").
TString CanonizeWBUrl(TString url) {
	if (url.EndsWith("detail.aspx/detail.aspx")) {
		// 12 == length of the extra "/detail.aspx".
		url = url.substr(0, url.size() - 12);
	}

	const TString marker = "catalog/";
	const auto markerPos = url.find(marker);
	if (markerPos == TString::npos) {
		return url;
	}

	const size_t idBegin = markerPos + marker.size();
	size_t idEnd = idBegin;
	while (idEnd < url.size() && IsDigit(url[idEnd])) {
		++idEnd;
	}
	if (idEnd == idBegin) {
		// No numeric product id: not a product page, leave as is.
		return url;
	}

	return url.substr(0, idEnd) + "/detail.aspx";
}

// Replaces the host of `url` (everything before the first '/') with the
// IDNA/punycode form of `domain` and lower-cases the whole URL in place.
//
// `url`    - scheme-less URL, modified in place.
// `domain` - host to convert; only read, never modified.
// Returns a copy of the rewritten `url`.
//
// If the conversion fails or yields an empty string, `domain` is used as is.
TString LowerAndPunycodeUrl(TString& url, TString& domain) {
	auto domain_end = url.find('/');
	if (domain_end == TString::npos)
		domain_end = url.length();

	// The output pointer is only inspected after a successful call; it is
	// pre-set to nullptr so that a failed call never leaves it indeterminate.
	char* converted_domain = nullptr;
	const auto check = idn2_to_ascii_8z(domain.c_str(), &converted_domain, 0);

	TString domain_new = domain;
	if (check == IDN2_OK && converted_domain != nullptr) {
		if (converted_domain[0] != '\0')
			domain_new = converted_domain;
		free(converted_domain);
	}

	url = domain_new + url.substr(domain_end);
	for (size_t i = 0; i < url.length(); i++) {
		// tolower() requires a value representable as unsigned char.
		const char ch = url[i];
		url[i] = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
	}
	return url;
}

// Maps a diamondelectric URL carrying an "id" query parameter to
// "diamondelectric.ru/catalog/<id>".
// URLs that fail to parse or have no "id" parameter are returned unchanged.
TString CanonizeDiamondelectricUrl(const TString& url) {
	NUri::TUri parsed;
	const bool parsedOk =
		parsed.Parse(url, NUri::TFeature::FeaturesRecommended) == NUri::TState::EParsed::ParsedOK;
	if (!parsedOk) {
		return url;
	}

	TCgiParameters query(parsed.GetField(NUri::TField::FieldQuery));
	if (!query.Has("id")) {
		return url;
	}
	return "diamondelectric.ru/catalog/" + query.Get("id");
}

// Canonizes an aliexpress URL.
// With an "item_id" query parameter the result is
// "aliexpress.ru/item/<item_id>.html"; otherwise the query string is cut off.
// A URL that fails to parse is returned unchanged.
TString CanonizeAliexpressUrl(const TString& url) {
	NUri::TUri parsed;
	const bool parsedOk =
		parsed.Parse(url, NUri::TFeature::FeaturesRecommended) == NUri::TState::EParsed::ParsedOK;
	if (!parsedOk) {
		return url;
	}

	TCgiParameters query(parsed.GetField(NUri::TField::FieldQuery));
	if (query.Has("item_id")) {
		return "aliexpress.ru/item/" + query.Get("item_id") + ".html";
	}

	// Without '?' find() returns npos and substr() keeps the whole URL.
	return url.substr(0, url.find('?'));
}

// Strips a trailing "/reviews" or "/specification" section from an mvideo URL.
// Only the first matching suffix is removed; other URLs are returned as is.
TString CanonizeMvideoUrl(const TString& url) {
	const TVector<TString> sectionSuffixes {"/reviews", "/specification"};

	for (const auto& suffix : sectionSuffixes) {
		if (!url.EndsWith(suffix)) {
			continue;
		}
		return url.substr(0, url.size() - suffix.size());
	}

	return url;
}

// Cuts a vseinstrumenti URL at the first "/otzyvy", dropping that segment and
// everything after it. URLs without "/otzyvy" are returned unchanged.
TString CanonizeVseinstrumentiUrl(const TString& url) {
	const auto reviewsPos = url.find("/otzyvy");
	// When nothing is found reviewsPos is npos and substr() keeps the whole URL.
	return url.substr(0, reviewsPos);
}

// Truncates a svyaznoy URL right after its last digit.
// The character at index 0 is never examined, so a URL whose only digit is
// its first character (or which has no digits at all) is returned unchanged.
// An empty string is returned as is; the previous "size() - 1" start index
// wrapped around for empty input.
TString CanonizeSvyaznoyUrl(const TString& url) {
	// `end` is one past the character being tested, so it never underflows.
	for (size_t end = url.size(); end > 1; --end) {
		if (IsDigit(url[end - 1]))
			return url.substr(0, end);
	}

	return url;
}

// Strips a trailing "/otzyvy" or "/spec" section from a sbermegamarket URL.
// Only the first matching suffix is removed; other URLs are returned as is.
// EndsWith() is used instead of substr(size() - tail.size()) so that inputs
// shorter than the suffix are handled without an unsigned wrap-around, in
// line with the length guard in CanonizeMvideoUrl.
TString CanonizeSbermegamarketUrl(const TString& url) {
	const TVector<TString> tails {"/otzyvy", "/spec"};

	for (const auto& tail : tails) {
		if (url.EndsWith(tail))
			return url.substr(0, url.size() - tail.size());
	}

	return url;
}

// Canonizes a citilink product URL: the path is truncated right after its
// last digit and the original query string (if any) is appended again.
//
// The rewritten URL is used only if it contains "citilink.ru/product";
// otherwise the original `url` is returned.
//
// The scan uses an unsigned "one past" index, so an empty path part (empty
// input or input starting with '?') no longer starts the scan at index -1.
TString CanonizeCitilinkUrl(const TString& url) {
	const auto queryStart = url.find('?');
	const bool hasQuery = queryStart != TString::npos;
	const TString path = hasQuery ? url.substr(0, queryStart) : url;
	const TString query = hasQuery ? url.substr(queryStart) : TString();

	// Number of leading path characters to keep. The first character is never
	// tested, so a path without digits past index 0 shrinks to one character
	// and is rejected by the "citilink.ru/product" check below.
	size_t keep = path.size();
	while (keep > 1 && !IsDigit(path[keep - 1])) {
		--keep;
	}

	const TString canonizedUrl = path.substr(0, keep) + query;
	return (canonizedUrl.find("citilink.ru/product") == TString::npos) ? url : canonizedUrl;
}

// Maps a sidex URL to "sidex.ru/view.php?id=<value>".
// For URLs starting with "sidex.ru/accessories" the value comes from the
// "item" query parameter, for all others from "id".
// URLs that fail to parse or lack that parameter are returned unchanged.
TString CanonizeSidexUrl(const TString& url) {
	NUri::TUri parsed;
	const bool parsedOk =
		parsed.Parse(url, NUri::TFeature::FeaturesRecommended) == NUri::TState::EParsed::ParsedOK;
	if (!parsedOk) {
		return url;
	}

	TCgiParameters query(parsed.GetField(NUri::TField::FieldQuery));

	const TString accessoriesPrefix = "sidex.ru/accessories";
	const bool isAccessory = url.substr(0, accessoriesPrefix.size()) == accessoriesPrefix;
	const char* idParam = isAccessory ? "item" : "id";

	if (!query.Has(idParam)) {
		return url;
	}
	return "sidex.ru/view.php?id=" + query.Get(idParam);
}

// Removes the first path segment of a scheme-less URL
// ("host/<segment>/rest" -> "host/rest"), unless that segment is one of
// "products", "catalog" or "zapchasti".
// A URL with fewer than two '/' characters is returned unchanged.
TString CutRegionFromUrl(const TString& url) {
	const auto firstSlash = url.find('/');
	if (firstSlash == TString::npos) {
		return url;
	}

	const auto secondSlash = url.find('/', firstSlash + 1);
	if (secondSlash == TString::npos) {
		return url;
	}

	// Includes the leading '/', excludes the closing one.
	const TString firstSegment = url.substr(firstSlash, secondSlash - firstSlash);
	const bool isSiteSection =
		firstSegment == "/products" || firstSegment == "/catalog" || firstSegment == "/zapchasti";
	if (isSiteSection) {
		return url;
	}

	return url.substr(0, firstSlash) + url.substr(secondSlash);
}

// Splits `params` on single spaces.
// An empty input gives an empty vector; consecutive, leading or trailing
// spaces produce empty elements.
TVector<TString> GetParamsFromString(const TString& params) {
	TVector<TString> tokens;
	if (params.empty()) {
		return tokens;
	}

	size_t tokenStart = 0;
	for (auto space = params.find(' '); space != TString::npos; space = params.find(' ', tokenStart)) {
		tokens.push_back(params.substr(tokenStart, space - tokenStart));
		tokenStart = space + 1;
	}
	// Whatever follows the last space (possibly nothing) is the final token.
	tokens.push_back(params.substr(tokenStart));
	return tokens;
}

bool CheckCgi(const TString& cgiName, const TString& cgiTemp) {
    i64 cgiNameSize = cgiName.size();
    i64 cgiTempSize = cgiTemp.size();
    if (cgiTempSize == 0 || cgiNameSize == 0) {
        return false;
	}

    if (cgiTemp[0] == '*') {
        if (cgiTempSize > 1 && cgiTemp[cgiTempSize - 1] == '*') {
            return cgiName.find(cgiTemp.substr(1, cgiTempSize - 2)) != TString::npos;
        }
        else {
            auto startPos = cgiNameSize - (cgiTempSize - 1);
            return startPos >= 0 && cgiName.substr(startPos) == cgiTemp.substr(1);
        }
    }
    else if (cgiTemp[cgiTempSize - 1] == '*') {
        return cgiName.substr(0, cgiTempSize - 1) == cgiTemp.substr(0, cgiTempSize - 1);
    }
    else {
        return cgiName == cgiTemp;
	}
}

// Appends the built-in list of CGI parameter names to cut (tracking,
// region, advertising and similar parameters) to `parsed_params` and returns
// a copy of the resulting vector.
//
// All names are lower-case: CanonizeUrl lower-cases the whole URL before the
// query string is parsed, so the former mixed-case entries ("itemOptionId",
// "previousPage", "shopId") could never match a parameter name.
TVector<TString> AddExtraCGI(TVector<TString>& parsed_params){
	static const char* const kDefaultCgiToCut[] = {
		"yandex",
		"google",
		"select",
		"cityid",
		"city",
		"city_id",
		"amp;reff",
		"sa",
		"usg",
		"partner",
		"ved",
		"mrkt",
		"mark",
		"seo",
		"nomobile",
		"wt_fm",
		"location",
		"storeid",
		"sda",
		"ad",
		"gbid",
		"block",
		"adult",
		"adjust_tracker",
		"itemoptionid",
		"recommended_code",
		"spm",
		"adjust_creative",
		"recommended_by",
		"adjust_campaign",
		"partnumber",
		"partname",
		"availability",
		"previouspage",
		"category-id",
		"no_overlay",
		"from-show-uid",
		"skuid",
		"sign",
		"parent-reqid",
		"turbo_ic",
		"turbo_uid",
		"pokupki",
		"utm_param1",
		"mclid",
		"shopid",
		"context",
		"utm_campaign",
		"utm_city",
		"utm_content",
		"utm_medium",
		"utm_phone",
		"utm_source",
		"utm_term",
		"utm_referrer",
	};

	for (const char* name : kDefaultCgiToCut) {
		parsed_params.push_back(name);
	}

	return parsed_params;
}

// Removes from the query string of `url` every parameter whose name matches
// one of the patterns in `params` (see CheckCgi), re-prints the remaining
// parameters and strips one trailing '/' from the result.
//
// With no patterns, or when `url` cannot be parsed, only the trailing '/' is
// stripped. If no parameters remain, the '?' is dropped as well.
TString CutChosenCGIParams(const TString& url, const TVector<TString>& params) {
	if (params.empty()) {
		return CutLastSlash(url);
	}

	NUri::TUri parsed;
	const bool parsedOk =
		parsed.Parse(url, NUri::TFeature::FeaturesRecommended) == NUri::TState::EParsed::ParsedOK;
	if (!parsedOk) {
		return CutLastSlash(url);
	}
	TCgiParameters query(parsed.GetField(NUri::TField::FieldQuery));

	// Collect the names first: the container must not be modified while it
	// is being iterated.
	TVector<TString> namesToDrop;
	for (const auto& pattern : params) {
		for (const auto& entry : query) {
			if (CheckCgi(entry.first, pattern)) {
				namesToDrop.push_back(entry.first);
			}
		}
	}
	for (const auto& name : namesToDrop) {
		query.Erase(name);
	}

	const TString base = url.substr(0, url.find("?"));
	const auto remaining = query.Print();
	if (remaining.empty()) {
		return CutLastSlash(base);
	}
	return CutLastSlash(base + "?" + remaining);
}

} // namespace

// Builds the canonical form of a shop URL.
//
// `url`                - URL to canonize, with or without an http(s) scheme.
// `params`             - space-separated CGI name patterns to cut (see
//                        CheckCgi); when empty, the built-in list from
//                        AddExtraCGI is used.
// `importantCgiParams` - space-separated names that must not be cut; they are
//                        removed from the cut list by exact match.
//
// Steps: strip scheme, "www."/"m." prefixes and a trailing '/'; punycode the
// host and lower-case the URL; decode escaped '/', '?', '=', '&'; apply the
// shop-specific rule selected by the host; cut the chosen CGI parameters.
TString CanonizeUrl(const TString& url, const TString& params, const TString& importantCgiParams) {
	TVector<TString> parsed_params = GetParamsFromString(params);
	if (parsed_params.empty()) {
		AddExtraCGI(parsed_params);
	}

	auto pure_url = CutLastSlash(CutSchemeWwwMobile(url));
	auto domain = GetDomain(pure_url);
	LowerAndPunycodeUrl(pure_url, domain);
	// Re-read the host from the normalized URL: the dispatch below compares
	// against lower-case literals, while the value taken above still has the
	// letter case of the input (e.g. "OZON.ru").
	domain = GetDomain(pure_url);
	pure_url = DecodeQuestionAndEq(pure_url);

	if (domain == "ozon.ru")
		pure_url = CanonizeOzonUrl(pure_url);
	else if (domain == "wildberries.ru")
		pure_url = CanonizeWBUrl(pure_url);
	else if (domain == "diamondelectric.ru")
		pure_url = CanonizeDiamondelectricUrl(pure_url);
	else if (domain == "aliexpress.ru")
		pure_url = CanonizeAliexpressUrl(pure_url);
	else if (domain == "mvideo.ru")
		pure_url = CanonizeMvideoUrl(pure_url);
	else if (domain == "vseinstrumenti.ru")
		pure_url = CanonizeVseinstrumentiUrl(pure_url);
	else if (domain == "svyaznoy.ru")
		pure_url = CanonizeSvyaznoyUrl(pure_url);
	else if (domain == "sbermegamarket.ru")
		pure_url = CanonizeSbermegamarketUrl(pure_url);
	else if (domain == "citilink.ru")
		pure_url = CanonizeCitilinkUrl(pure_url);
	else if (domain == "zdravcity.ru" || domain == "eldorado.ru")
		// These shops need no query string at all.
		pure_url = CutLastSlash(CutCGIParams(pure_url));
	else if (domain == "carlon.ru" || domain == "evropharm.ru" || domain == "megapteka.ru")
		pure_url = CutRegionFromUrl(pure_url);
	else if (domain == "sidex.ru")
		pure_url = CanonizeSidexUrl(pure_url);
	else if (domain == "ozon.onelink.me")
		pure_url = CanonizeOzonOneLinkUrl(pure_url, domain);
	else if (domain == "sportmaster.ru")
		parsed_params.push_back("product_id");
	else if (domain == "farfetch.com")
		parsed_params.push_back("size");

	// Parameters the caller marked as important are taken off the cut list.
	const auto cgiToLeave = GetParamsFromString(importantCgiParams);
	parsed_params.erase(
		std::remove_if(
			parsed_params.begin(),
			parsed_params.end(),
			[&cgiToLeave](const TString& cgi) {
				return std::find(cgiToLeave.begin(), cgiToLeave.end(), cgi) != cgiToLeave.end();
			}),
		parsed_params.end()
	);

	return CutChosenCGIParams(pure_url, parsed_params);
}
