package ru.yandex.tours.tools.merging.corpus

import ru.yandex.tours.tools.{HotelAware, Tool}
import ru.yandex.tours.util.IO
import ru.yandex.tours.util.Randoms._
import ru.yandex.tours.util.parsing.Tabbed

/**
 * Tool that writes a labelled corpus of id pairs to `data/near_hotel_corpus_2.tsv`.
 *
 * For every hotel drawn into [[sample]] it emits:
 *  - at most one row labelled `1`: the `travelId`s of two partner entries sampled
 *    from the hotel's own `partnerIds` (only when the hotel has at least two);
 *  - up to [[negativePerHotel]] rows labelled `0`: the hotel's id paired with the id
 *    of a hotel sampled from `shardedHotelsIndex.near(hotel, 30)`, skipped whenever
 *    the two hotels have at least one equal value among their `name.allValues`.
 *
 * The process is terminated with `sys.exit()` once the file has been written.
 *
 * NOTE(review): the meaning of the `30` passed to `near` (radius? result limit?) and
 * the exact semantics of the `sample` extension from `Randoms` are not visible in
 * this file -- confirm against their definitions.
 */
object NearHotelExtractor extends Tool with HotelAware {

  // Value handed to `hotels.sample`; presumably a per-hotel keep probability chosen so
  // that roughly 4000 hotels end up in the sample -- TODO confirm against `Randoms`.
  val f = 4000d / shardedHotelsIndex.size

  // Upper bound on the number of nearby hotels sampled as negative candidates per hotel.
  val negativePerHotel = 5

  val sample = shardedHotelsIndex.hotels.sample(f)

  IO.printFile("data/near_hotel_corpus_2.tsv") { out =>
    sample.foreach { hotel =>
      // Positive example: two partner records attached to the same hotel.
      val partners = hotel.partnerIds
      if (partners.size >= 2) {
        val Seq(first, second) = partners.sample(2).toSeq
        out.println(Tabbed(first.travelId, second.travelId, 1))
      }

      // Negative examples: nearby hotels that do not share any name value with this one.
      val candidates = shardedHotelsIndex.near(hotel, 30).toSeq.sample(negativePerHotel)
      candidates.foreach { candidate =>
        val sharesName = hotel.name.allValues.exists { own =>
          candidate.name.allValues.exists(other => own == other)
        }

        if (!sharesName) {
          out.println(Tabbed(hotel.id, candidate.id, 0))
        }
      }
    }
  }
  sys.exit()
}
