package ru.yandex.tours.tools.merging

import java.io.File

import ru.yandex.tours.model.hotels.HotelsHolder.PartnerHotel
import ru.yandex.tours.util.{ProtoIO, IO}
import ru.yandex.tours.util.parsing.Tabbed

/**
 * One-off tool that turns a tab-separated file of hotel URL pairs into a
 * tab-separated corpus of hotel id pairs.
 *
 * Input: the file named by the first command-line argument, or
 * `/Users/berkut/Downloads/toloka.csv` when no argument is given. The first line
 * is skipped (presumably a header row -- confirm against the actual export).
 * Columns read per row:
 *  - 0, 1: hotel URLs (either a travel.yandex.ru hotel page or a partner URL);
 *  - 2: label, compared against the literal "different";
 *  - 4: integer vote count ("same" votes, judging by the local name).
 *
 * Output: `clustering_corpus.tsv` with one line per kept row: `id1`, `id2` and
 * `0` for pairs labelled "different", `1` otherwise.
 *
 * Rows whose URLs cannot be resolved to a hotel id are skipped. Rows with fewer
 * than five columns or a non-numeric column 4 still fail the run.
 */
object HotelUrlToIdConverterTool extends App {

  val travelPrefix: String = "https://travel.yandex.ru/hotel/"

  // Number of characters taken right after `travelPrefix` and parsed as the hotel id.
  private val travelIdLength: Int = 7

  // Partner URL -> hotel id, built from every PartnerHotel stored in partner_hotels.proto.
  val url2id = ProtoIO.loadFromFile(new File("partner_hotels.proto"), PartnerHotel.PARSER).map { h =>
    h.getRawHotel.getPartnerUrl -> h.getId
  }.toMap

  /**
   * Resolves a hotel URL to a hotel id.
   *
   * URLs starting with [[travelPrefix]] carry the id in the characters right after
   * the prefix; any other URL is looked up in [[url2id]].
   *
   * @param url hotel URL taken from the input file
   * @return the hotel id, or `None` when the URL is unknown or the id part of a
   *         travel URL is not a valid integer
   */
  private def getId(url: String): Option[Int] = {
    if (url.startsWith(travelPrefix)) {
      val rawId = url.slice(travelPrefix.length, travelPrefix.length + travelIdLength)
      // A malformed id is treated like an unknown URL instead of aborting the whole run.
      try Some(rawId.toInt) catch { case _: NumberFormatException => None }
    } else {
      url2id.get(url)
    }
  }

  // The first CLI argument overrides the default input location.
  private val inputPath: String = args.headOption.getOrElse("/Users/berkut/Downloads/toloka.csv")
  private val lines: Iterator[String] = IO.readLines(inputPath)

  // drop(1) skips the first line without failing on an empty file.
  val result = lines.drop(1).flatMap { line =>
    val parts = line.split("\t")
    val votesSame = parts(4).toInt
    val isDifferent = parts(2) == "different"
    // NOTE(review): only rows labelled "different" with votesSame == 2 are kept, so
    // every emitted row ends up with label 0, and column 5 is never consulted.
    // Confirm this filter is what was intended.
    for {
      id1 <- getId(parts(0))
      if isDifferent && votesSame == 2
      id2 <- getId(parts(1))
    } yield (id1, id2, isDifferent)
  }

  // `result` is a lazy iterator; it is consumed here, while the output file is open.
  IO.printFile("clustering_corpus.tsv") { pw =>
    result.foreach {
      case (id1, id2, isDifferent) => pw.println(Tabbed(id1, id2, if (isDifferent) 0 else 1))
    }
  }
}
