package ru.yandex.tours.tools.merging

import java.io.File

import ru.yandex.tours.model.hotels.HotelsHolder.PartnerHotel
import ru.yandex.tours.model.hotels.Partners
import ru.yandex.tours.model.hotels.Partners.Partner
import ru.yandex.tours.util.{IO, ProtoIO}
import ru.yandex.tours.util.parsing.Tabbed

import scala.collection.mutable
import scala.util.Try

/**
 * Command-line tool that reads a tab-separated Toloka results file and writes
 * `head <TAB> slave <TAB> 0|1` rows to a corpus file.
 *
 * Optional positional arguments (each falls back to the historical hard-coded value):
 *  1. path to the Toloka results file,
 *  1. path to the partner hotels proto file,
 *  1. path to the output corpus file.
 */
object TolokaAnalyser extends App {
  private val DefaultTolokaFile = "/Users/berkut/Downloads/toloka.csv"
  private val DefaultHotelsFile = "partner_hotels.proto"
  private val DefaultCorpusFile = "toloka_corpus.tsv"

  /**
   * Matches a travel hotel page URL, optionally followed by `?context=rooms`.
   * Group 1 is the numeric hotel id. Dots in the host name are escaped so that
   * they only match a literal dot.
   */
  val travelRegexp = """https://travel\.yandex\.ru/hotel/(\d+)(\?context=rooms)?""".r

  /** Partner URL -> hotel id, filled by [[collectMaps]]. */
  val url2idMap = mutable.Map.empty[String, Int]
  /** Hotel id -> partner URL, filled by [[collectMaps]]. */
  val id2url = mutable.Map.empty[Int, String]
  /** Hotel id -> (partner code, partner-side hotel id), filled by [[collectMaps]]. */
  val id2partnerId = mutable.Map.empty[Int, (Int, String)]

  /** Loads `PartnerHotel` messages from the given proto file. */
  def plainHotel(file: String): Iterator[PartnerHotel] = {
    ProtoIO.loadFromFile(new File(file), PartnerHotel.PARSER)
  }

  /**
   * Fills [[url2idMap]], [[id2url]] and [[id2partnerId]] from the given proto file.
   * When several hotels share a key, the one read last wins.
   */
  def collectMaps(file: String): Unit = {
    plainHotel(file).foreach { hotel =>
      val raw = hotel.getRawHotel
      url2idMap += raw.getPartnerUrl -> hotel.getId
      id2url += hotel.getId -> raw.getPartnerUrl
      id2partnerId += hotel.getId -> ((raw.getPartner, raw.getPartnerId))
    }
  }

  /** Builds an immutable partner URL -> hotel id map from the given proto file. */
  def readUrl2IdMatcher(file: String): Map[String, Int] = {
    plainHotel(file).map(h => h.getRawHotel.getPartnerUrl -> h.getId).toMap
  }

  /** Builds an immutable hotel id -> partner URL map from the given proto file. */
  def readId2Url(file: String): Map[Int, String] = {
    plainHotel(file).map(h => h.getId -> h.getRawHotel.getPartnerUrl).toMap
  }

  collectMaps(args.lift(1).getOrElse(DefaultHotelsFile))

  /**
   * Resolves a URL to a hotel id: travel URLs carry the id in the path, any other
   * URL is looked up in [[url2idMap]].
   *
   * Throws `NumberFormatException` when the id in a travel URL does not fit an `Int`
   * and `NoSuchElementException` when the URL is unknown; callers wrap this in `Try`.
   */
  private def url2id(url: String): Int = {
    val matcher = travelRegexp.pattern.matcher(url)
    if (matcher.matches()) {
      matcher.group(1).toInt
    } else {
      url2idMap(url)
    }
  }

  /** Parses a verdict: `"same"` -> `Some(true)`, `"different"` -> `Some(false)`, else `None`. */
  private def getIsSame(x: String): Option[Boolean] = {
    x match {
      case "same" => Some(true)
      case "different" => Some(false)
      case _ => None
    }
  }

  /**
   * A judged pair of hotels.
   *
   * @param head       hotel id resolved from the first URL
   * @param headUrl    first URL as it appears in the results file
   * @param slave      hotel id resolved from the second URL
   * @param partner    partner of the slave hotel
   * @param partnerId  partner-side id of the slave hotel
   * @param partnerUrl second URL as it appears in the results file
   * @param isMerge    `true` when the pair was judged "same" and is not a golden-set row
   */
  case class Link(head: Int,
                  headUrl: String,
                  slave: Int,
                  partner: Partner,
                  partnerId: String,
                  partnerUrl: String,
                  isMerge: Boolean)

  /**
   * Reads judged hotel pairs from a tab-separated results file.
   *
   * The first line is skipped (presumably a header row). A row is silently dropped when
   * its verdict is neither `"same"` nor `"different"`, when either URL cannot be resolved
   * to a hotel id, or when no partner info is known for the slave id.
   *
   * NOTE(review): a line that does not fit the ten-column `Tabbed` pattern still fails
   * with a `MatchError`, exactly as before.
   *
   * @param file path to the results file
   * @return all links that could be resolved, in file order
   */
  def readTolokaResults(file: String): Iterable[Link] = {
    val source = scala.io.Source.fromFile(file)
    try {
      // drop(1) instead of next(): an empty file yields no links rather than an exception.
      source.getLines().drop(1).flatMap {
        case Tabbed(url1, url2, result, goldenSet, _, _, _, _, _, _) =>
          for {
            isSame <- getIsSame(result)
            slave <- Try(url2id(url2)).toOption
            master <- Try(url2id(url1)).toOption
            // An id taken from a travel URL may be absent from the loaded hotels;
            // skip such a row instead of aborting the whole run.
            partnerInfo <- id2partnerId.get(slave)
          } yield {
            val (partner, partnerId) = partnerInfo
            // Golden-set rows never count as merges.
            val isMerge = goldenSet.isEmpty && isSame
            Link(master, url1, slave, Partners(partner), partnerId, url2, isMerge)
          }
      }.toList // materialise before the source is closed
    } finally {
      source.close()
    }
  }


  IO.printFile(args.lift(2).getOrElse(DefaultCorpusFile)) { pw =>
    readTolokaResults(args.lift(0).getOrElse(DefaultTolokaFile)).foreach { link =>
      pw.println(Tabbed(link.head, link.slave, if (link.isMerge) 1 else 0))
    }
  }

  //  Statistics.distribution[Link, String]("heads", resultsToApply, extract(_))
}
