package ru.yandex.tours.indexer.clusterization

import ru.yandex.tours.clustering.Clustering.LinkWithConfidence
import ru.yandex.tours.db._
import ru.yandex.tours.db.dao.HotelsDao
import ru.yandex.tours.db.dao.HotelsDao.{IsNew, SkipDeleted}
import ru.yandex.tours.db.tables.{ClusterLink, Clusterization}
import ru.yandex.tours.indexer.task.{AsyncUpdatable, TaskWeight}
import ru.yandex.tours.model.hotels.HotelsHolder.PartnerHotel
import ru.yandex.tours.model.hotels.Partners
import ru.yandex.tours.util.IO
import ru.yandex.tours.util.concurrent.BatchExecutor

import scala.concurrent.duration.FiniteDuration
import scala.concurrent.{ExecutionContext, Future}
import scala.util.Random
import scala.util.control.NonFatal

/**
 * Periodic task ("cluster_hotels") that merges new partner hotels into the set of old ones.
 *
 * Each run:
 *  1. dumps old hotels (`IsNew(false)`, the "master" grid) and new hotels (`IsNew(true)`,
 *     the "slave" grid) into file-backed grids;
 *  2. asks a clusterizer built by `clusterizerFactory` for links between them;
 *  3. cleans the links against the links already stored in the database;
 *  4. stores the cleaned links and passes the ids of confidently linked hotels to
 *     `hotelsDao.publish`.
 *
 * @param dbWrapper          database handle used for reading old links and writing new ones
 * @param hotelsDao          source of partner hotels; also receives the ids to publish
 * @param clusterizerFactory builds a clusterizer for a (master, slave) pair of grids
 * @param updateTime         interval handed to [[AsyncUpdatable]]
 * @param ec                 execution context all asynchronous steps are chained on
 */
class HotelClusterizer(dbWrapper: DBWrapper,
                       hotelsDao: HotelsDao,
                       clusterizerFactory: ClusterizerFactory,
                       updateTime: FiniteDuration)
                      (implicit ec: ExecutionContext)
  extends AsyncUpdatable(updateTime, "cluster_hotels") with TaskWeight.Unique {

  /** Number of items handed to the database per batch. */
  private val BATCH_SIZE = 1000
  /** Upper bound on the number of new hotels clustered in a single run. */
  private val MAX_HOTELS_TO_CLUSTER = 50000
  /** Links with at least this confidence cause both linked hotels to be published. */
  private val CONFIDENCE_THRESHOLD = 0.5

  /**
   * Performs one full clusterization pass.
   *
   * Every grid that was successfully opened is closed once the pass finishes, whether it
   * succeeded or failed. Each grid is closed in its own callback so that a failure to close
   * one of them cannot prevent closing the other.
   *
   * @return a future completed when links are saved, or failed with the first error
   */
  def run(): Future[Unit] = {
    log.info("Start retrieving grids")
    getGrid(isNewHotels = false).flatMap { masterGrid =>
      val outcome = safely(getGrid(isNewHotels = true)).flatMap { slaveGrid =>
        safely(mergeGrids(masterGrid, slaveGrid)).andThen { case _ => slaveGrid.close() }
      }
      // Attached to the whole remainder of the pipeline, so the master grid is released
      // even when the slave grid could not be retrieved at all.
      outcome.andThen { case _ => masterGrid.close() }
    }
  }

  /**
   * Evaluates `body`, turning a synchronously thrown non-fatal exception into a failed
   * future so that cleanup callbacks chained on the result always get a chance to run.
   */
  private def safely[T](body: => Future[T]): Future[T] =
    try body catch { case NonFatal(e) => Future.failed(e) }

  /**
   * Clusters hotels of `slaveGrid` against `masterGrid`, cleans the resulting links and
   * saves them. Does not close the grids; that is the caller's responsibility.
   */
  private def mergeGrids(masterGrid: GridInFile[PartnerHotel],
                         slaveGrid: GridInFile[PartnerHotel]): Future[Unit] = {
    log.info(s"Start merging $slaveGrid into $masterGrid")
    val clusterizer = clusterizerFactory(masterGrid, slaveGrid)
    val slaveSize = slaveGrid.size
    val hotelsToCluster =
      if (slaveSize > MAX_HOTELS_TO_CLUSTER) {
        // Pick a random window of MAX_HOTELS_TO_CLUSTER hotels. Valid offsets are
        // 0 .. (slaveSize - MAX_HOTELS_TO_CLUSTER) inclusive, hence the "+ 1": nextInt's
        // bound is exclusive and without it the last hotel could never be selected.
        val offset = Random.nextInt(slaveSize - MAX_HOTELS_TO_CLUSTER + 1)
        slaveGrid.iterator.slice(offset, offset + MAX_HOTELS_TO_CLUSTER)
      } else slaveGrid.iterator
    for {
      links <- clusterizer.getMergeResult(hotelsToCluster)
      cleaned <- cleanLinks(links, masterGrid.iterator ++ slaveGrid.iterator)
      _ <- saveLinks(cleaned)
    } yield {
      // Logged only on success; a failed pass must not claim that hotels were clustered.
      log.info("Hotels clustered!")
    }
  }

  /**
   * Passes freshly computed links through [[HotelLinkCleaner]], which is given the links
   * already stored in the database and a hotel id -> partner mapping.
   *
   * @param links  links produced by the clusterizer
   * @param hotels all hotels taking part in the run; consumed once to build the mapping
   * @return the result of `removeExcessLinksAndSetConfidence` applied to `links`
   */
  private def cleanLinks(links: Seq[LinkWithConfidence],
                         hotels: Iterator[PartnerHotel]): Future[Seq[LinkWithConfidence]] = {
    for {
      oldLinks <- Clusterization.retrieveClusterLinks(Clusterization.defaultMinConfidence, dbWrapper, "Clustering")
    } yield {
      val hotelToPartner = hotels.map(h => h.getId -> Partners(h.getRawHotel.getPartner)).toMap
      val cleaner = new HotelLinkCleaner(oldLinks, hotelToPartner)
      cleaner.removeExcessLinksAndSetConfidence(links)
    }
  }

  /**
   * Stores the links inside a transaction and passes the ids of hotels that take part in
   * a link with confidence >= CONFIDENCE_THRESHOLD to `hotelsDao.publish`.
   *
   * All links are inserted, regardless of confidence; only the publishing step is
   * filtered by the threshold. Both steps are executed in batches of BATCH_SIZE.
   */
  private def saveLinks(links: Iterable[LinkWithConfidence]): Future[Unit] = {
    Transactions.withTransaction(dbWrapper) { transaction =>
      val linksToAdd = links.map { l =>
        ClusterLink(0, l.parent, l.child, transaction.id, l.confidence)
      }
      // A set, so a hotel that appears in several links is passed to publish only once.
      val idsToUpdate = links.filter(_.confidence >= CONFIDENCE_THRESHOLD).flatMap(l => Seq(l.child, l.parent)).toSet
      for {
        linksCount <- BatchExecutor.executeInBatch[ClusterLink](linksToAdd.iterator,
                                                                "cluster links inserted",
                                                                BATCH_SIZE,
                                                                tables.Clusterization.insert(dbWrapper, _))
        hotelsAdded <- BatchExecutor.executeInBatch[Int](idsToUpdate.iterator,
                                                         "hotels published",
                                                         BATCH_SIZE,
                                                         hotelsDao.publish)
      } yield {
        log.info(s"Clusterization done. $linksCount links added to db. $hotelsAdded hotels added.")
      }
    }
  }

  /**
   * Builds a file-backed grid of hotels that have a point.
   *
   * Hotels are retrieved from `hotelsDao` with the `IsNew(isNewHotels)` and `SkipDeleted`
   * filters, dumped into a fresh temporary folder indexed by their point, and read back
   * as a [[GridInFile]]. The retrieved map is always closed before returning.
   *
   * The temporary folder is created only after the hotels have been retrieved, so a failed
   * retrieval leaves nothing behind; if dumping or reading fails the folder is deleted.
   * On success the folder is left in place for the returned grid.
   * NOTE(review): whether closing the grid removes the folder is not visible here - confirm.
   *
   * @param isNewHotels `true` for new hotels, `false` for old ones
   * @return the grid; the caller is responsible for closing it
   */
  private def getGrid(isNewHotels: Boolean): Future[GridInFile[PartnerHotel]] = {
    val name = if (isNewHotels) "new" else "old"
    log.info(s"Start retrieving grid for $name hotels")

    hotelsDao.retrieveRafMap(IsNew(isNewHotels), SkipDeleted).map { rafMap =>
      log.info(s"$name hotels retrieved")
      try {
        val gridFolder = IO.newTempDir()
        try {
          val hotels = rafMap.valuesIterator.filter(_.getRawHotel.hasPoint)
          Grid.dumpToFolder[PartnerHotel](gridFolder.getAbsolutePath, hotels, _.getRawHotel.getPoint)
          log.info(s"$name hotels dumped to grid")
          Grid.readFromFolder[PartnerHotel](gridFolder.getAbsolutePath, PartnerHotel.PARSER)
        } catch {
          case NonFatal(e) =>
            // Best-effort cleanup: a failure to delete must not hide the original error.
            try IO.deleteFile(gridFolder)
            catch { case NonFatal(cleanupError) => e.addSuppressed(cleanupError) }
            throw e
        }
      } finally {
        rafMap.close()
      }
    }
  }

  /** Entry point invoked by [[AsyncUpdatable]]; delegates to [[run]]. */
  override protected def update: Future[_] = run()
}
