package ru.yandex.tours.indexer.clusterization.similarity

import ru.yandex.tours.indexer.clusterization.ComparingUtil
import ru.yandex.tours.indexer.clusterization.ComparingUtil._
import ru.yandex.tours.model.hotels.HotelsHolder.PartnerHotel
import ru.yandex.tours.util.naming.HotelNameUtils

import scala.collection.JavaConverters._
import scala.collection.mutable

/**
 * Hotel similarity based on character shingles of hotel names.
 *
 * Every name and synonym of a hotel is split into separate names, lower-cased,
 * taken both as-is and transliterated, and turned into a set of 3-character
 * shingles. The score is the minimum of `jacardIndex` over all pairs of shingle
 * sets of the two hotels.
 */
object ShingleSimilarity extends HotelSimilarity {
  /**
   * Compares two hotels by the shingles of their names.
   *
   * @param a - first hotel
   * @param b - second hotel
   * @return - minimal Jaccard index between hotel name shingles; 1 when either
   *         hotel has no names or no shingle sets could be built for it
   */
  override def similarity(a: PartnerHotel, b: PartnerHotel): Double = {
    if (a.getRawHotel.getNameCount == 0 || b.getRawHotel.getNameCount == 0) {
      // Nothing to compare: keep the historical result for hotels without names.
      1.0
    } else {
      val pairwiseIndexes = for {
        left <- getShingles(a)
        right <- getShingles(b)
      } yield jacardIndex(left, right)
      // `min` throws on an empty collection, which happens when name splitting
      // produces nothing for one of the hotels; treat that like the no-name case.
      if (pairwiseIndexes.isEmpty) 1.0 else pairwiseIndexes.min
    }
  }

  /** Collects the values of all names followed by all synonyms of the hotel. */
  private def allNames(a: PartnerHotel) = {
    a.getRawHotel.getNameList.asScala.map(_.getValue) ++ a.getRawHotel.getSynonymsList.asScala.map(_.getValue)
  }

  /**
   * Builds one shingle set per name variant of the hotel: each raw name is split
   * by `HotelNameUtils.splitNameToNames`, lower-cased, and shingled both in its
   * original form and after `translite`.
   */
  private def getShingles(a: PartnerHotel): Seq[Set[String]] = {
    for {
      rawName <- allNames(a)
      splitName <- HotelNameUtils.splitNameToNames(rawName)
      lower = splitName.toLowerCase
      name <- Seq(lower, translite(lower))
      shingles = splitInShingles(name)
    } yield shingles
  }

  /**
   * Splits a name into the set of 3-character windows of its words.
   *
   * The name is passed through `cleanName` and split into words with
   * `ComparingUtil.split`; a non-empty word shorter than three characters
   * yields itself as its only shingle (standard `sliding` behaviour).
   */
  private def splitInShingles(a: String): Set[String] = {
    val cleaned = cleanName(a)
    (for {
      word <- ComparingUtil.split(cleaned)
      shingle <- word.sliding(3)
    } yield shingle).toSet
  }

  /** Identifier of this similarity measure. */
  override def name: String = "name_shingle"
}
