package ru.yandex.tours.hotels.clustering

import java.net.URL

import ru.yandex.common.util.StringUtils
import ru.yandex.common.util.StringUtils._
import ru.yandex.tours.geo
import ru.yandex.tours.hotels.HotelsIndex
import ru.yandex.tours.model.BaseModel.Point
import ru.yandex.tours.model.hotels.HotelsHolder
import ru.yandex.tours.model.hotels.HotelsHolder.{HotelType, PartnerHotel}
import ru.yandex.tours.util.naming.{HotelNameUtils, TfIdfModel}
import HotelContext._

import scala.collection.JavaConverters._
import scala.util.Try

/**
 * Pair of hotel contexts that are compared with each other.
 *
 * Author: Vladislav Dolbilov (darl@yandex-team.ru)
 * Created: 29.04.16
 *
 * @param context1 context of the first hotel
 * @param context2 context of the second hotel
 */
case class ClusteringContext(context1: HotelContext,
                             context2: HotelContext) {

  /**
   * Result of `geo.distanceInKm` for the points of both hotels.
   * Empty when at least one of the contexts has no point.
   */
  val distance = context1.point.flatMap { first =>
    context2.point.map { second =>
      geo.distanceInKm(first, second)
    }
  }
}

/**
 * Statistics gathered over a group of hotels: TF-IDF models built from their
 * names and addresses, and the number of hotels seen.
 *
 * The `hotels` iterator is fully consumed while the instance is constructed.
 *
 * @param hotels hotels to collect the statistics from
 */
class LocalContext(hotels: Iterator[PartnerHotel]) {
  private val nameModelBuilder = TfIdfModel.newBuilder()
  private val addressModelBuilder = TfIdfModel.newBuilder()

  /** Number of hotels the iterator produced. */
  // Single pass: every hotel feeds both builders and bumps the counter.
  val nearCount: Int = hotels.foldLeft(0) { (seen, hotel) =>
    for (name <- names(hotel)) nameModelBuilder += words(name)
    for (address <- addresses(hotel)) addressModelBuilder += words(address)
    seen + 1
  }

  /** TF-IDF model built from the words of all hotel names. */
  val localTfIdf = nameModelBuilder.result()

  /** TF-IDF model built from the words of all hotel addresses. */
  val localAddressTfIdf = addressModelBuilder.result()
}

/**
 * Features of a single partner hotel, pre-computed for comparison with other hotels.
 *
 * Every value is evaluated eagerly when the instance is created.
 *
 * @param hotel        partner hotel the features are extracted from
 * @param localContext supplies the TF-IDF models and the `nearCount` value
 */
class HotelContext(hotel: PartnerHotel, localContext: LocalContext) {

  /** Partner of the raw hotel. */
  val partnerId = hotel.getRawHotel.getPartner

  /** Hotel point; empty when it is absent or rejected by `HotelsIndex.isEmptyPoint`. */
  val point: Option[Point] =
    if (hotel.getRawHotel.hasPoint) Some(hotel.getRawHotel.getPoint).filterNot(HotelsIndex.isEmptyPoint)
    else None

  /** Stars of the raw hotel, if set. */
  val stars: Option[Int] = if (hotel.getRawHotel.hasStars) Some(hotel.getRawHotel.getStars) else None

  /** Whether the hotel type is set. */
  val hasType = hotel.hasType
  /** Value of `hotel.getType`; it is read regardless of `hasType`. */
  val hotelType = hotel.getType

  /** Distinct normalised names and synonyms, see `HotelContext.names`. */
  val cleanedNames: Seq[String] = names(hotel)

  /** Words of every cleaned name, one collection of words per name. */
  val cleanedNameWords = cleanedNames.map(splitName)

  /** Words of all cleaned names in a single sequence. */
  val cleanedNameWordsFlatten = cleanedNameWords.flatten

  /** Set of shingles per cleaned name, see `HotelContext.shingles`. */
  val cleanedNameShingles = cleanedNameWords.map(HotelContext.shingles)

  private val fullAddresses: Seq[String] = addresses(hotel)

  /** For every address variant: the set of digit runs found in it. */
  val addressDigits = fullAddresses.map(_.split("[^0-9]+").filter(_.nonEmpty).toSet)

  /** Phones reduced to digits only; phones without any digit are dropped. */
  val phones = hotel.getRawHotel.getPhoneList.asScala.map(cleanPhone).filter(_.nonEmpty)

  /** TF-IDF vector of the name words, computed with the local name model. */
  val cleanedNameWordsVector = localContext.localTfIdf.tfIdf(cleanedNameWordsFlatten)

  private val cleanedAddressWords = fullAddresses.flatMap(splitName)
  /** TF-IDF vector of the address words, computed with the local address model. */
  val cleanedAddressVector = localContext.localAddressTfIdf.tfIdf(cleanedAddressWords)

  /** Value of `localContext.nearCount`. */
  val nearHotelsCount = localContext.nearCount

  /** Perceptual hashes of the images that have one. */
  val pHashes = hotel.getImagesList.asScala.filter(_.hasPHash).map(_.getPHash)
  /** Feature vectors (as `Array[Double]`) of the images that have at least one feature. */
  val nnFeatures = hotel.getImagesList.asScala
    .filter(_.getNNetFeaturesCount > 0)
    .map(_.getNNetFeaturesList.asScala.toArray.map(_.doubleValue()))

  /**
   * Host of the hotel URL in lower case and without the leading "www.".
   *
   * Host names are case-insensitive, while `URL.getHost` returns the host as it was written,
   * so it is lower-cased (locale-independently) before the prefix is stripped; otherwise
   * "WWW.Hotel.com" and "hotel.com" would be treated as different hosts.
   * Empty when the URL is absent or cannot be parsed.
   */
  val host: Option[String] =
    if (hotel.getRawHotel.hasHotelUrl) {
      Try {
        new URL(hotel.getRawHotel.getHotelUrl).getHost
          .toLowerCase(java.util.Locale.ROOT)
          .stripPrefix("www.")
      }.toOption
    } else {
      None
    }

  /** True for apartment-like hotels: either by hotel type or by an "apart"/"апарт" marker in a name. */
  val isApart = {
    (hotelType == HotelType.APARTMENTS) || (hotelType == HotelType.APARTHOTEL) || {
      cleanedNames.exists(name => apartRegext.findFirstIn(name).nonEmpty)
    }
  }
}

/**
 * Text normalisation helpers used to build hotel features, and a factory for [[HotelContext]].
 */
object HotelContext {
  /** A lowercase letter or a digit immediately followed by an uppercase letter. */
  private[clustering] val camelCase = """([\p{IsLowercase}\d])(\p{IsUppercase})""".r
  /** Any character that is not an ASCII digit. */
  private[clustering] val phoneCleanRegexp = """[^0-9]""".r
  /** Marker of apartment-like names: "апарт" or "apart". */
  private[clustering] val apartRegext = """(?i)(апарт|apart)""".r


  /** Removes everything except ASCII digits from the phone. */
  private[clustering] def cleanPhone(phone: String) = phoneCleanRegexp.replaceAllIn(phone, "")

  /** Applies `normalizeWhitespace` and then `removeAccents` to the string. */
  private[clustering] def cleanName2(a: String): String = removeAccents(normalizeWhitespace(a))

  /** Transliterates the string with the `RUS_TO_ENTRANSLIT` table. */
  private[clustering] def translit(a: String): String = StringUtils.translit(RUS_TO_ENTRANSLIT, a)


  /**
   * Distinct normalised name variants of the hotel.
   *
   * Names and synonyms are split with `HotelNameUtils.splitNameToNames`, lower-cased,
   * and every part is emitted both as is and transliterated, each passed through `cleanName2`.
   */
  private[clustering] def names(hotel: PartnerHotel): Seq[String] = {
    val rawHotel = hotel.getRawHotel
    val names = rawHotel.getNameList.asScala.map(_.getValue) ++ rawHotel.getSynonymsList.asScala.map(_.getValue)
    (for {
      name <- names
      split <- HotelNameUtils.splitNameToNames(name)
      lower = split.toLowerCase
      toClean <- Seq(lower, translit(lower))
    } yield cleanName2(toClean)).distinct
  }

  /** Inserts a space at every `camelCase` match, e.g. "fooBar" becomes "foo Bar". */
  private[clustering] def splitCamelCase(name: String): String = camelCase.replaceAllIn(name, m => m.group(1) + " " + m.group(2))

  /** Replaces punctuation with spaces and splits the result with `HotelNameUtils.splitNameToWords`. */
  private[clustering] def splitName(name: String) = HotelNameUtils.splitNameToWords(convertPunctuationToSpaces(name))

  /** Same as `splitName`, converted to a `Seq`. */
  private[clustering] def words(name: String) = splitName(name).toSeq

  /**
   * Set of 3-character windows over every word.
   * A non-empty word shorter than 3 characters contributes itself as a single shingle.
   */
  private[clustering] def shingles(words: Array[String]): Set[String] =
    words.flatMap(_.sliding(3)).toSet


  /**
   * Joins country, admin name, locality, street, house and full address with single spaces
   * and lower-cases the result.
   */
  private[clustering] def joinAddress(addr: HotelsHolder.Address): String =
    joinAddressParts(addr).toLowerCase

  /** Joins the address parts with single spaces, keeping the original letter case. */
  private def joinAddressParts(addr: HotelsHolder.Address): String = {
    val sb = new StringBuilder
    sb ++= addr.getCountry ++= " "
    sb ++= addr.getAdminName ++= " "
    sb ++= addr.getLocality ++= " "
    sb ++= addr.getStreet ++= " "
    sb ++= addr.getHouse ++= " "
    sb ++= addr.getFullAddress
    sb.toString
  }

  /**
   * Distinct non-empty normalised address variants of the hotel.
   *
   * Every address is joined, split on camel-case boundaries, lower-cased, stripped of
   * punctuation, and emitted both as is and transliterated, each passed through `cleanName2`.
   */
  private[clustering] def addresses(hotel: PartnerHotel) = {
    val variants = hotel.getRawHotel.getAddressList.asScala
      // Camel-case boundaries exist only before lower-casing, so the split has to come first;
      // applied to an already lower-cased string the `camelCase` pattern can never match.
      // NOTE(review): this changes address tokens compared to the previous (no-op) behaviour;
      // consumers tuned on the old tokens may need re-evaluation.
      .map(joinAddressParts)
      .map(splitCamelCase)
      .map(_.toLowerCase)
      .map(convertPunctuationToSpaces)
      .flatMap(name => Seq(name, translit(name)))
      .map(cleanName2)
    variants.distinct.filter(_.nonEmpty)
  }


  /** Creates a [[HotelContext]] for the hotel within the given local context. */
  def apply(hotel: PartnerHotel, localContext: LocalContext): HotelContext = {
    new HotelContext(hotel, localContext)
  }
}