package ru.yandex.tours.hotels.clustering

import java.io.InputStream

import breeze.numerics.sigmoid
import ru.yandex.extdata.common.meta.DataType
import ru.yandex.tours.extdata.{DataDefWithDependencies, DataTypes}
import ru.yandex.tours.hotels.clustering.features.{StarsFeature, UrlHostFeature, _}
import ru.yandex.tours.ml.{Features, MLModel}
import ru.yandex.tours.util.naming.TfIdfModel
import ru.yandex.vertis.matrixnet.{Formula, MnFormulaReader}
import shapeless._

/**
 * Model that scores a [[ClusteringContext]] with a Matrixnet formula and
 * squashes the raw score through a sigmoid.
 *
 * Author: Vladislav Dolbilov (darl@yandex-team.ru)
 * Created: 06.05.16
 *
 * @param tfIdfModel TF-IDF model handed to [[NameTfIdfCosFeature]]
 * @param formula    Matrixnet formula evaluated on the feature vector
 */
class ClusteringModel(tfIdfModel: TfIdfModel, formula: Formula) extends MLModel[ClusteringContext] {

  /** Format version, taken from the data type declared in the companion object. */
  override def version: Int = ClusteringModel.dataType.getCurrentFormatVersion

  /** Human-readable summary of what the model predicts. */
  override def description: String =
    """Matrixnet model to classify duplicate hotels.
      |
      |Returns value in range [0, 1] with probability that hotels are equal.
    """.stripMargin

  /**
   * Feature set of the model. A new [[Features]] instance is built on every call.
   *
   * NOTE(review): the order below presumably has to match the feature order the
   * formula was trained with -- confirm before reordering or inserting entries.
   */
  override def features: Features[ClusteringContext] =
    new Features(
      new NameTfIdfCosFeature(tfIdfModel),
      LocalTfIdfCosFeature,
      AddressNumberFeature,
      AddressTfIdfCosFeature,
      SamePartnerFeature,
      LocalDensityFeature,
      HotelTypesFeature,
      BoundedDistanceFeature,
      PHashNearestFeature,
      PHashAucFeature,
      CountOfSimilarOnPHashFeature,
      NNFeaturesNearestCosFeature,
      NNFeaturesCosAucFeature,
      LevenshteinFeature,
      NameSetFeature,
      PhoneSuffixFeature,
      NameShingleFeature,
      UrlHostFeature,
      StarsFeature,
      IsApartAnyFeature,
      IsApartBoothFeature,
      IsBackaFeature
    )

  /**
   * Evaluates the formula on an already computed feature vector.
   *
   * @param features numeric feature values passed to the formula as is
   * @return sigmoid of the raw formula output
   */
  override def apply(features: Array[Double]): Double =
    sigmoid(formula.calc(features))
}

/**
 * Data definition for [[ClusteringModel]]: declares its data type, the data it
 * depends on, and how to build the model from a stream plus those dependencies.
 */
object ClusteringModel extends DataDefWithDependencies[ClusteringModel, TfIdfModel :: HNil] {

  /** Data type under which the Matrixnet clustering model is stored. */
  override def dataType: DataType = DataTypes.clusteringMatrixnetModel

  /** The model needs the hotels TF-IDF model to be available. */
  override def dependsOn: Set[DataType] = Set(DataTypes.hotelsTfIdfModel)

  /**
   * Builds the model from a serialized formula and the resolved dependencies.
   *
   * @param is           stream the formula is read from via [[MnFormulaReader]];
   *                     this method does not close it itself
   * @param dependencies single-element HList carrying the TF-IDF model
   * @return model combining the TF-IDF model with the loaded formula
   */
  override def parse(is: InputStream, dependencies: TfIdfModel :: HNil): ClusteringModel =
    dependencies match {
      case tfIdfModel :: HNil =>
        val loadedFormula = new MnFormulaReader().load(is)
        new ClusteringModel(tfIdfModel, loadedFormula)
    }
}
