package ru.yandex.tours.indexer.ml

import ru.yandex.extdata.loader.engine.DataPersistenceManager
import ru.yandex.tours.extdata.DataTypes
import ru.yandex.tours.hotels.HotelsIndex
import ru.yandex.tours.indexer.clusterization.ComparingUtil
import ru.yandex.tours.indexer.clusterization.ComparingUtil._
import ru.yandex.tours.indexer.task.{PeriodicUpdatable, TaskWeight}
import ru.yandex.tours.model.hotels.Hotel
import ru.yandex.tours.util.IO
import ru.yandex.tours.util.naming.{HotelNameUtils, TfIdfModel}

import scala.concurrent.duration.FiniteDuration

/**
 * Task that builds a [[TfIdfModel]] from the names of the hotels in the index and hands
 * the serialized model to the [[DataPersistenceManager]].
 *
 * Scheduling is inherited from [[PeriodicUpdatable]], which is constructed with
 * `updateTime` and the task name "hotels_tf_idf".
 *
 * Author: Vladislav Dolbilov (darl@yandex-team.ru)
 * Created: 29.04.16
 *
 * @param hotelsIndex            source of the hotels whose names feed the model
 * @param dataPersistenceManager receives the model under `DataTypes.hotelsTfIdfModel`
 * @param updateTime             duration forwarded to `PeriodicUpdatable`
 *                               (presumably the update interval -- confirm there)
 */
class HotelsTfIdfIndexer(hotelsIndex: HotelsIndex,
                         dataPersistenceManager: DataPersistenceManager,
                         updateTime: FiniteDuration)
  extends PeriodicUpdatable(updateTime, "hotels_tf_idf") with TaskWeight.Medium {

  /**
   * Collects the distinct name variants of a hotel.
   *
   * All values of the hotel's name plus its synonyms are split with
   * `HotelNameUtils.splitNameToNames`; every resulting part is lower-cased and emitted
   * twice: passed through `cleanName2` as is, and passed through `cleanName2` after
   * `translite`.
   */
  private def names(hotel: Hotel): Seq[String] = {
    // Both variants produced for a single name part.
    def variantsOf(part: String): Seq[String] = {
      val lowered = part.toLowerCase
      Seq(cleanName2(lowered), cleanName2(translite(lowered)))
    }

    (hotel.name.allValues ++ hotel.synonyms)
      .flatMap(raw => HotelNameUtils.splitNameToNames(raw).flatMap(part => variantsOf(part)))
      .distinct
  }

  /**
   * Builds the model from every hotel's name variants (each run through
   * `ComparingUtil.split`) and passes the written stream to `checkAndStore`.
   */
  override def run(): Unit = {
    val model = TfIdfModel.build(
      hotelsIndex.hotels
        .flatMap(names)
        .map(ComparingUtil.split)
    )
    val serialized = IO.writeStream(out => model.saveTo(out))
    dataPersistenceManager.checkAndStore(DataTypes.hotelsTfIdfModel, serialized)
  }
}
