package ru.yandex.tours.hotels.clustering.features

import ru.yandex.tours.hotels.clustering.ClusteringContext
import ru.yandex.tours.ml.FeatureExtractor
import ru.yandex.tours.model.hotels.HotelsHolder.PartnerHotel
import ru.yandex.tours.util.naming.HotelNameUtils

import scala.collection.JavaConverters._

/**
  * Feature comparing the cleaned-name shingle sets of the two sides of a [[ClusteringContext]].
  *
  * The value is the smallest Jaccard distance found over all pairs formed from one shingle set
  * of `context1` and one shingle set of `context2`; `-1` is reported when there are no pairs.
  */
object NameShingleFeature extends FeatureExtractor[ClusteringContext] {

  /** Identifier of this feature. */
  override def name: String = "name_shingle"

  /**
    * Computes the feature value for a pair of contexts.
    *
    * @param ctx context holding both sides of the comparison
    * @return the minimum pairwise Jaccard distance between the shingle sets of both sides,
    *         or `-1` if no pair of shingle sets could be formed
    */
  override def apply(ctx: ClusteringContext): Double = {
    // NOTE(review): presumably each side exposes one shingle set per cleaned name variant —
    // confirm against the definition of `cleanedNameShingles`.
    val firstSideSets = ctx.context1.cleanedNameShingles
    val secondSideSets = ctx.context2.cleanedNameShingles

    val distances = for {
      first <- firstSideSets
      second <- secondSideSets
    } yield jaccardDistance(first, second)

    // No pairs at all (at least one side has no shingle sets): report the -1 sentinel.
    if (distances.isEmpty) -1d else distances.min
  }

  /**
    * Jaccard distance between two sets: `1 - |a ∩ b| / |a ∪ b|`.
    *
    * @return `0` for equal sets (including two empty sets), `1` for disjoint non-empty input
    */
  private def jaccardDistance(a: Set[String], b: Set[String]): Double = {
    val union = a.union(b)
    if (union.isEmpty) {
      // Both sets are empty; avoid dividing by zero and treat them as identical.
      0d
    } else {
      val shared = a.intersect(b)
      1d - shared.size.toDouble / union.size
    }
  }

  /**
    * Collects the distinct results of `sliding(3)` applied to every word.
    *
    * Not referenced anywhere inside this object.
    */
  private def splitInShingles(words: Array[String]): Set[String] =
    words.iterator.flatMap(_.sliding(3)).toSet
}
