package ru.yandex.tours.indexer.clusterization

import ru.yandex.common.util.StringUtils._
import ru.yandex.tours.model.BaseModel.ProtoImage
import ru.yandex.tours.model.hotels.HotelsHolder.PartnerHotel
import ru.yandex.tours.util.Vectors
import ru.yandex.tours.util.naming.HotelNameUtils

import scala.collection.JavaConverters._

object ComparingUtil {

  // Matches a lowercase letter or a digit (group 1) immediately followed by an uppercase
  // letter (group 2), i.e. the seam inside a camelCase word. Used by splitCamelCase.
  private val camelCase = """([\p{IsLowercase}\d])(\p{IsUppercase})""".r

  // Generic accommodation-type words (plus the article "the") in several languages.
  // removeMarkers drops every token of a name that is contained in this set, so all
  // entries are written in lower case.
  // NOTE(review): cleanName applies removeAccents before removeMarkers; if removeAccents
  // strips diacritics, the accented entries (e.g. "gästehaus", "hôtel") can never match on
  // that path, and "gästehaus" has no unaccented twin in the set - TODO confirm.
  val ignoreWords = Set("rooms", "suites", "villas", "виллы", "apartments", "studios", "апартаменты",
    "dormitory", "hotel", "cottages", "bungalows", "коттеджи", "бунгало", "inn", "apartment",
    "home", "villa", "motel", "hostel", "residence", "guesthouse", "plaza", "hôtel",
    "apartamentos", "отель", "cottage", "hostal", "haus", "hotels", "apartamento", "aparthotel",
    "appartement", "homestay", "résidence", "дом", "resorts", "apartament", "гостиница", "хостел",
    "homes", "flat", "room", "gästehaus", "penzion", "d'hôtes", "appartements", "dom", "residences",
    "apartman", "bungalow", "hotel,", "мини-отель", "hotell", "otel", "mini-hotel", "the")

  // Distance limits, in kilometres according to the `_IN_KM` naming.
  // getMaxDistance currently returns the SPARSE value unconditionally; the TIGHT value is
  // not referenced anywhere else inside this object (it may be used by external callers).
  val MAX_DISTANCE_IN_KM_SPARSE = 2.5
  val MAX_DISTANCE_IN_KM_TIGHT = 0.5

  /**
    * Returns the maximum distance limit (kilometres, per the constant's name).
    *
    * The argument is currently not consulted: the sparse-grid limit
    * `MAX_DISTANCE_IN_KM_SPARSE` is returned for every input.
    *
    * @param numberOfHotelsInGrid number of hotels in the grid cell; presently unused
    * @return `MAX_DISTANCE_IN_KM_SPARSE`
    */
  def getMaxDistance(numberOfHotelsInGrid: Int): Double = MAX_DISTANCE_IN_KM_SPARSE

  /**
    * Tells whether `b` occurs somewhere inside `a`.
    *
    * @param a the string that is searched
    * @param b the candidate substring; an empty string is contained in every `a`
    * @return `true` when `b` is a (case-sensitive) substring of `a`
    */
  def isSubStr(a: String, b: String): Boolean = a.contains(b)

  /**
    * Inserts a single space at every camelCase seam of `name`, i.e. between a lowercase
    * letter or digit and the uppercase letter that directly follows it
    * (for example "GrandHotel" becomes "Grand Hotel").
    *
    * @param name the text to split
    * @return `name` with a space added at each seam matched by `camelCase`
    */
  def splitCamelCase(name: String): String = camelCase.replaceAllIn(name, "$1 $2")

  /**
    * Extracts the runs of ASCII digits from `name`, in order of appearance.
    *
    * @param name the text to scan
    * @return the non-empty digit chunks; empty when `name` contains no digits
    */
  def splitByNonDigits(name: String): Array[String] = {
    // Splitting on non-digit runs can leave a leading empty chunk, which is discarded.
    val chunks = name.split("[^0-9]+")
    chunks.filterNot(_.isEmpty)
  }

  /**
    * Breaks a hotel name into words.
    *
    * The name is first passed through `convertPunctuationToSpaces` and the result is handed
    * to `HotelNameUtils.splitNameToWords`; both helpers are defined outside this file, so
    * their exact tokenisation rules are not visible here.
    *
    * @param name the raw or pre-cleaned name
    * @return the words produced by `HotelNameUtils.splitNameToWords`
    */
  def split(name: String): Array[String] = {
    val withoutPunctuation = convertPunctuationToSpaces(name)
    HotelNameUtils.splitNameToWords(withoutPunctuation)
  }

  /**
    * Drops every word of `a` that is listed in `ignoreWords` and joins the remaining words
    * with single spaces. Matching is exact, so callers are expected to lower-case first.
    */
  private def removeMarkers(a: String) = {
    val keptWords = split(a).filter(word => !ComparingUtil.ignoreWords.contains(word))
    keptWords.mkString(" ")
  }

  /**
    * Fully normalises a hotel name for comparison: lower-cases it, applies `cleanName2`
    * (whitespace and accent normalisation) and finally removes the generic words listed in
    * `ignoreWords`.
    *
    * @param a the name to normalise
    * @return the normalised name, words separated by single spaces
    */
  def cleanName(a: String): String = {
    val normalized = cleanName2(a.toLowerCase)
    removeMarkers(normalized)
  }
  /**
    * Light normalisation of a name: lower-cases it, then applies `normalizeWhitespace` and
    * `removeAccents` in that order. Unlike `cleanName`, no words are removed.
    * Both helpers are presumably provided by the `StringUtils._` import; their exact
    * behaviour is not visible in this file.
    *
    * @param a the name to normalise
    * @return the lower-cased, whitespace- and accent-normalised name
    */
  def cleanName2(a: String): String = {
    val lowered = a.toLowerCase
    val collapsed = normalizeWhitespace(lowered)
    removeAccents(collapsed)
  }

  /**
    * Applies `cleanName` to every element of `x`.
    *
    * @param x the names to normalise
    * @return the normalised names, in a collection of the same kind as `x`
    */
  def cleanNames(x: Iterable[String]): Iterable[String] =
    for (name <- x) yield cleanName(name)

  /**
    * Collects the distinct normalised names of a partner hotel.
    *
    * Every name and synonym of the raw hotel is split with
    * `HotelNameUtils.splitNameToNames`; each resulting part is cleaned with `cleanName`
    * both as-is and after `translite`, and duplicates are removed.
    *
    * @param x the partner hotel whose names are read
    * @return the set of cleaned name variants
    */
  def getCleanedNames(x: PartnerHotel): Iterable[String] = {
    val raw = x.getRawHotel
    val primaryNames = raw.getNameList.asScala.map(_.getValue)
    val synonyms = raw.getSynonymsList.asScala.map(_.getValue)
    val cleaned = (primaryNames ++ synonyms).flatMap { name =>
      HotelNameUtils.splitNameToNames(name).flatMap { part =>
        // Each part contributes two candidates: the original and its transliteration.
        Seq(part, translite(part)).map(variant => cleanName(variant))
      }
    }
    cleaned.toSet
  }

  /**
    * Lower-cases `a` and transliterates it with the `RUS_TO_ENTRANSLIT` table via
    * `translit` (both presumably provided by the `StringUtils._` import; judging by the
    * table's name this maps Russian text to Latin letters - confirm against StringUtils).
    *
    * @param a the text to transliterate
    * @return the transliterated, lower-cased text
    */
  def translite(a: String): String = {
    val lowered = a.toLowerCase
    translit(RUS_TO_ENTRANSLIT, lowered)
  }

  /**
    * Jaccard distance between two sets of strings.
    *
    * Despite the method name, the value returned is `1 - |a ∩ b| / |a ∪ b|`, i.e. the
    * distance rather than the similarity index: 0 for equal sets, 1 for disjoint ones.
    *
    * @param a first set
    * @param b second set
    * @return a value in [0, 1]; 0 when both sets are empty
    */
  def jacardIndex(a: Set[String], b: Set[String]): Double = {
    val union = a | b
    // The union is empty exactly when both inputs are empty; avoid dividing by zero.
    if (union.isEmpty) {
      0.0
    } else {
      val shared = (a & b).size
      1 - shared.toDouble / union.size
    }
  }

  /**
    * Hamming distance, in bits, between the UTF-8 encodings of two strings.
    *
    * Bytes are compared pairwise position by position; when the encodings differ in length
    * only the common prefix is compared and the surplus bytes of the longer string are
    * ignored.
    *
    * @param a first string
    * @param b second string
    * @return the number of differing bits over the compared byte pairs
    */
  def bitCompare(a: String, b: String): Int = {
    // Use a fixed charset so the result does not depend on the platform default encoding.
    val utf8 = java.nio.charset.StandardCharsets.UTF_8
    a.getBytes(utf8).zip(b.getBytes(utf8)).map {
      case (x, y) =>
        // Byte ^ Byte is sign-extended to Int; mask to the low 8 bits so that pairs in
        // which only one byte has its high bit set are counted instead of being skipped.
        Integer.bitCount((x ^ y) & 0xFF)
    }.sum
  }

  /**
    * Lists the pHash values of the hotel's images, skipping images for which `hasPHash`
    * is false.
    *
    * @param x the partner hotel whose images are inspected
    * @return the pHash strings, in image order
    */
  def getPHashes(x: PartnerHotel): Iterable[String] = {
    x.getImagesList.asScala.collect {
      case image if image.hasPHash => image.getPHash
    }
  }

  /**
    * Normalised Euclidean distance between the neural-network feature vectors of two
    * images.
    *
    * The sum of squared component differences is divided by 96 * 4 before the square root
    * is taken. Assuming (as the original author notes) that each of the 96 features lies
    * in [-1, 1], the largest possible raw distance is sqrt(96 * 4), so the result falls
    * in [0, 1].
    *
    * @param a first image
    * @param b second image
    * @return the normalised distance, or 1 when either image has no features
    * @throws AssertionError if features are present but either image does not have 96
    */
  def imageDistance(a: ProtoImage, b: ProtoImage): Double = {
    val bothHaveFeatures = a.getNNetFeaturesCount != 0 && b.getNNetFeaturesCount != 0
    if (bothHaveFeatures) {
      assert(a.getNNetFeaturesCount == 96 && b.getNNetFeaturesCount == 96, "Wrong number of NN features")
      val featuresA = a.getNNetFeaturesList.asScala
      val featuresB = b.getNNetFeaturesList.asScala
      val squaredDiffs = featuresA.zip(featuresB).map {
        case (x, y) => (x - y) * (x - y)
      }
      Math.sqrt(squaredDiffs.sum / 96 / 4)
    } else {
      // Without features nothing can be compared: report the maximum distance.
      1
    }
  }

  /**
    * Cosine similarity between the neural-network feature vectors of two images, as
    * computed by `Vectors.cos` on the features converted to `Double` arrays.
    *
    * @param a first image
    * @param b second image
    * @return the value returned by `Vectors.cos`, or 0 when either image has no features
    * @throws AssertionError if features are present but either image does not have 96
    */
  def imageCosSimilarity(a: ProtoImage, b: ProtoImage): Double = {
    val bothHaveFeatures = a.getNNetFeaturesCount != 0 && b.getNNetFeaturesCount != 0
    if (bothHaveFeatures) {
      assert(a.getNNetFeaturesCount == 96 && b.getNNetFeaturesCount == 96, "Wrong number of NN features")

      val left = a.getNNetFeaturesList.asScala.toArray.map(_.doubleValue())
      val right = b.getNNetFeaturesList.asScala.toArray.map(_.doubleValue())
      Vectors.cos(left, right)
    } else {
      // Without features nothing can be compared: report zero similarity.
      0
    }
  }
}
