require(bit64)
library(data.table)
library(reshape2)
library(ggplot2)
library(scales)
library(knitr)
library(parallel)
library(chron)


# Default value for each breakdown dimension, keyed by column name.
# data_prep() starts from these values when it pins dimensions that were not
# requested, and data_bind() uses them to fill columns a table lacks.
data_defaults <- c(
  page = "1+",
  ar_verdict = "EVERYBODY",
  user_agent = "all_ua",
  dc = "<all>",
  tld = "tld-all",
  ajax = "ajax-all",
  https = "http-all",
  turbo = "turbo-all",
  visibility = "visible",
  ua_version = "version_all",
  region = "region-all"
)

# Keep only the rows of `data` that satisfy every condition in `filter`.
#
# data   - table to filter. The conditions are evaluated as `data[ <cond> ,]`
#          with bare column names, so this is expected to be a data.table.
# filter - named list (or named vector): name = column, value = accepted
#          value(s). A single value becomes `col == value`; a list entry
#          holding several values becomes `col %in% c(...)`. Single character
#          values are wrapped in double quotes, numeric ones are inserted as is.
#          An empty or NULL filter keeps every row.
# debug  - when TRUE, return list(data=, expr=) where `expr` is the generated
#          code string; otherwise return the filtered table only.
#
# NOTE(review): values are spliced into R source text, so a string containing
# a double quote will break the generated expression. Only pass trusted values.
data_list_filter <- function(data, filter, debug = FALSE) {
  # seq_along() rather than 1:length(): an empty filter must produce no
  # conditions instead of iterating over c(1, 0).
  conditions <- vapply(seq_along(filter), function(i) {
    entry <- filter[i]
    column <- names(entry)
    if (length(unlist(entry)) > 1) {
      # `entry` is a one-element list here; paste0() turns its vector into the
      # text of a c(...) call (or a:b for integer ranges).
      paste(column, "%in%", paste0(entry))
    } else {
      value <- unlist(entry)
      literal <- if (is.numeric(value)) paste0(value) else paste0('"', value, '"')
      paste(column, "==", literal)
    }
  }, character(1))

  if (length(conditions) == 0) {
    # Nothing to filter on: leave `data` untouched and report a no-op.
    expr <- "data=data"
  } else {
    expr <- paste(conditions, collapse = ' & ')
    expr <- paste("data=data[", expr, ",]")
    # The assignment inside `expr` rebinds the local `data`.
    eval(parse(text = expr), envir = environment())
  }

  if (debug) {
    return(list(data = data, expr = expr))
  }
  data
}

# Select, derive and reshape the columns of a statistics table.
#
# data        - data.table to process (presumably the result of data_load();
#               confirm against callers).
# need_fields - character vector of columns wanted in the result. Besides
#               plain column names, these entries are computed on request:
#                 "mean"             auc / 100
#                 "weekday"          weekdays(date)
#                 "week"             week(date)
#                 "ymdf"             date formatted "%Y-%m-%d", ordered factor
#                 "chron"            chron(date, time)
#                 "user_agent_turbo" paste(user_agent, turbo, sep="-")
#                 "service_https"    paste(service, https, sep="-")
#                 "p"                columns p00..p99 melted into long form
#                                    (integer column `p` plus `value`)
#                 "pp"               same for p10,p20,...,p90,p95
#                                    (integer column `pp` plus `value`)
# filter      - optional conditions in the format accepted by
#               data_list_filter(); applied after the derived columns exist.
#               Defaults to an empty list, which applies no extra filtering.
#
# Every dimension of data_defaults that is a column of `data` but is not in
# need_fields is pinned to its default value before anything else happens.
#
# Returns the resulting data.table, keyed by (date, time) when a `time`
# column is present, otherwise by (date, p) or (date, pp) when those exist.
data_prep <- function(data, need_fields, filter = list()) {

  # Fall back to alternative default values when the preferred one does not
  # occur in this table at all.
  local_defaults <- data_defaults
  if (!(local_defaults["https"] %in% unique(data$https))) {
    local_defaults["https"] <- "http"
  }
  if (!(local_defaults["ajax"] %in% unique(data$ajax))) {
    local_defaults["ajax"] <- "not-ajax-query"
  }
  if (!(local_defaults["visibility"] %in% unique(data$visibility))) {
    local_defaults["visibility"] <- "visibility-all"
  }

  # The combined column needs both of its source columns kept unpinned.
  if ("user_agent_turbo" %in% need_fields) {
    need_fields <- c(need_fields, "user_agent", "turbo")
    need_fields <- unique(need_fields)
  }

  # "p" / "pp" are placeholders: swap them for the real percentile columns.
  # "value" is dropped too because melt() creates it further down.
  p <- NULL
  if ("p" %in% need_fields) {
    p <- sprintf('p%02d', 0:99)
    need_fields <- c(need_fields, p)
    need_fields <- need_fields[need_fields != 'p' & need_fields != 'value']
    need_fields <- unique(need_fields)
  }
  pp <- NULL
  if ("pp" %in% need_fields) {
    pp <- sprintf('p%02d', c(10, 20, 30, 40, 50, 60, 70, 80, 90, 95))
    need_fields <- c(need_fields, pp)
    need_fields <- need_fields[need_fields != 'pp' & need_fields != 'value']
    need_fields <- unique(need_fields)
  }

  # Pin every unrequested dimension that exists in `data` to its default.
  # todo: change to data_list_filter
  default_names <- names(local_defaults)
  pinned <- local_defaults[!is.element(default_names, need_fields) & is.element(default_names, names(data))]
  pairs <- lapply(names(pinned), function(name) paste0(name, "=='", local_defaults[[name]], "'"))
  pairs <- paste0(pairs, collapse = " & ")
  pairs <- paste0("data=data[", pairs, ",]")
  # The generated assignment rebinds the local `data`.
  eval(parse(text = pairs))

  if ("mean" %in% need_fields) {
    data[, mean := auc / 100]
  }

  if ("weekday" %in% need_fields) {
    data$weekday <- weekdays(data$date)
  }

  if ("week" %in% need_fields) {
    data$week <- week(data$date)
  }

  if ("ymdf" %in% need_fields) {
    data$ymdf <- chron(data$date, 0)
    data$ymdf <- format(data$ymdf, format = "%Y-%m-%d")
    # Levels follow order of first appearance in the table.
    data$ymdf <- factor(x = data$ymdf, levels = unique(data$ymdf), labels = unique(data$ymdf), ordered = TRUE)
  }

  if ("chron" %in% need_fields) {
    data$chron <- chron(data$date, data$time)
  }

  if ("user_agent_turbo" %in% need_fields) {
    data[, user_agent_turbo := paste(user_agent, turbo, sep = "-")]
  }

  if ("service_https" %in% need_fields) {
    data[, service_https := paste(service, https, sep = "-")]
  }

  # Callers such as data_squash() pass no filter; skip the step in that case
  # instead of handing an empty condition list to data_list_filter().
  if (length(filter) > 0) {
    data <- data_list_filter(data, filter)
  }
  data <- data[, need_fields, with = FALSE]

  # Wide percentile columns -> long form: one row per percentile.
  if (!is.null(p)) {
    data <- as.data.table(melt(data, measure.vars = p, variable.name = 'variable', value.name = 'value'))
    # "p07" -> 7
    data[, p := as.integer(substring(variable, 2))]
    data$variable <- NULL
  }

  if (!is.null(pp)) {
    data <- as.data.table(melt(data, measure.vars = pp, variable.name = 'variable', value.name = 'value'))
    data[, pp := as.integer(substring(variable, 2))]
    data$variable <- NULL
  }

  if ("time" %in% colnames(data)) {
    setkeyv(data, c("date", "time"))
  } else if ("p" %in% colnames(data)) {
    setkeyv(data, c("date", "p"))
  } else if ("pp" %in% colnames(data)) {
    setkeyv(data, c("date", "pp"))
  }

  data
}



# Collapse `data` down to the `except` columns plus the `autocalc` fields by
# delegating to data_prep(), which pins every other known dimension to its
# default value.
#
# No `filter` is forwarded, so this relies on data_prep() accepting a call
# without one.
data_squash <- function(data, except = c(), autocalc = c("mean", "auc")) {
  wanted_fields <- c(except, autocalc)
  data_prep(data, need_fields = wanted_fields)
}

# Stack several tables into one, aligning their columns first.
#
# tablenames - labels for the tables, parallel to `datalists`; used only in
#              the progress messages.
# datalists  - list of data.tables.
#
# The result has the union of all column names, in order of first appearance.
# A column missing from a table is added to it and filled with the matching
# value from data_defaults; it is an error if data_defaults has no such entry.
# Columns are added with `:=`, so the input tables may be modified by
# reference.
#
# Returns the row-bound data.table.
data_bind <- function(tablenames, datalists) {
  write("binding tables", file = stdout())

  # Union of column names across all tables, first appearance wins the order.
  columns <- as.character(unique(unlist(lapply(datalists, names), use.names = FALSE)))

  restables <- vector("list", length(datalists))

  # seq_along() so that an empty `datalists` simply yields an empty result.
  for (num in seq_along(datalists)) {
    dt <- datalists[[num]]
    tablename <- tablenames[[num]]

    for (name in setdiff(columns, names(dt))) {
      if (!(name %in% names(data_defaults))) {
        stop("data_bind: table ", tablename, " lacks column '", name,
             "' and data_defaults has no value for it", call. = FALSE)
      }
      value <- data_defaults[[name]]
      fill_values <- rep(value, nrow(dt))
      print(paste0(tablename, ": adding column '", name, "' with values '", value, "'"))
      # Parenthesised LHS assigns to the column named by `name`; this replaces
      # the deprecated `name := ..., with=FALSE` spelling.
      dt[, (name) := fill_values]
    }

    write(paste0(columns, collapse = ' '), file = stdout())
    # Same column order in every table so rbindlist() lines them up.
    restables[[num]] <- dt[, columns, with = FALSE]
  }

  rbindlist(restables)
}

# Load speedindex measurements and summarise latency per date and revision.
#
# date       - upper bound (inclusive) of the directories to read.
# lower_date - optional lower bound (inclusive); NA means no lower bound.
#
# Files matching "speedindex.*.txt" are searched recursively under "..". The
# first 12 characters of each path are compared as strings against
# "../<date>/", so this presumably expects 8-character date directory names
# directly under ".." (TODO confirm the exact format).
#
# Each file is read as a headerless tab-separated table with the columns
# ts, query, latency, revision. `ts` is divided by 1000 and interpreted as
# seconds since 1970-01-01 GMT.
#
# Returns a data.table keyed by (date, revision) with
#   count   - number of rows behind the group,
#   latency - integer mean over queries of the per-query median latency,
# restricted to groups with count > 1000.
speedindex_load <- function(date, lower_date = NA) {
  files <- list.files("..", "speedindex.*.txt", recursive = TRUE, full.names = TRUE)

  upper <- paste0("../", date, "/")
  unused <- files[substr(files, 1, 12) > upper]
  files <- files[substr(files, 1, 12) <= upper]

  if (!is.na(lower_date)) {
    lower <- paste0("../", lower_date, "/")
    # Strictly below the bound: a file dated exactly lower_date is used, so it
    # must not be reported as unused as well.
    unused <- c(unused, files[substr(files, 1, 12) < lower])
    files <- files[substr(files, 1, 12) >= lower]
  }

  print("used speedindex files:")
  print(files)
  print("unused speedindex  files:")
  print(unused)
  print("")

  tables <- lapply(files, function(name) {
    print(name)
    foo <- fread(name, sep = "\t", header = FALSE, stringsAsFactors = TRUE)
    setnames(foo, c("ts", "query", "latency", "revision"))
    foo$chron <- as.chron(as.POSIXct(foo$ts / 1000, origin = "1970-01-01", tz = "GMT"))
    foo$date <- dates(foo$chron)
    # Fractional part of the chron value times 24, truncated to an integer.
    foo$time <- as.integer((as.numeric(foo$chron) - as.integer(foo$chron)) * 24)
    return(foo)
  })

  measurements <- rbindlist(tables)
  # Median latency per query first, then the mean of those medians.
  gindex <- measurements[, list(.N, latency = as.integer(median(latency))), keyby = c("query", "date", "revision")]
  findex <- gindex[, list(count = sum(N), latency = as.integer(mean(latency))), keyby = c("date", "revision")]
  findex <- findex[count > 1000, ]
  findex
}

# Load the per-day statistics files in parallel and bind them into one table.
#
# date         - upper bound (inclusive) of the directories to read.
# type,
# version      - select the files: names matching "<version>.<type>.*.txt"
#                are searched recursively under "..".
# remove_trash - when TRUE, keep only rows of the main datasource (or whose
#                service matches "morda") and drop low-volume series, see
#                the comments in the worker below.
# lower_date   - optional lower bound (inclusive); NA means no lower bound.
#
# The first 12 characters of each path are compared as strings against
# "../<date>/", so this presumably expects 8-character date directory names
# directly under ".." (TODO confirm the exact format).
#
# Files are read on a PSOCK cluster with getOption("mc.cores", 2L) workers;
# worker output goes to "cluster-debug-load.txt". The per-file tables are
# combined with data_bind().
data_load <- function(date, type = "out", version = "uniparse", remove_trash = TRUE, lower_date = NA)
{
  files <- list.files("..", paste0(version, ".", type, ".*.txt"), recursive = TRUE, full.names = TRUE)
  print(files)

  upper <- paste0("../", date, "/")
  unused <- files[substr(files, 1, 12) > upper]
  files <- files[substr(files, 1, 12) <= upper]
  if (!is.na(lower_date)) {
    lower <- paste0("../", lower_date, "/")
    # Strictly below the bound: a file dated exactly lower_date is used, so it
    # must not be reported as unused as well.
    unused <- c(unused, files[substr(files, 1, 12) < lower])
    files <- files[substr(files, 1, 12) >= lower]
  }
  print("used files:")
  print(files)
  print("unused files:")
  print(unused)
  print("")

  cl <- makePSOCKcluster(names = getOption("mc.cores", 2L), outfile = "cluster-debug-load.txt")
  # Shut the workers down on every exit path, including errors raised by the
  # setup calls or by a worker.
  on.exit(stopCluster(cl), add = TRUE)

  clusterEvalQ(cl, require(bit64))
  clusterEvalQ(cl, library(data.table))
  clusterEvalQ(cl, library(reshape2))
  clusterEvalQ(cl, library(ggplot2))
  clusterEvalQ(cl, library(scales))
  clusterEvalQ(cl, library(parallel))
  clusterEvalQ(cl, library(chron))

  clusterExport(cl, "data_defaults", environment())

  #clusterEvalQ(cl,source('../../data_load.R'))

  tables <- parLapply(cl, files, function(name) {
  #tables=lapply(files, function(name) {
    print(name)
    foo <- fread(name, sep = "\t", header = TRUE, stringsAsFactors = TRUE)
    print(names(foo))
    if (is.element("page", names(foo)))
    {
      foo <- foo[page == "1+"] #TODO: remove other pages to detailed file later
    }

    if (remove_trash)
    {
      # Pick the main datasource by priority: profile_redir, redir, spylog.
      main_source <- NA
      datasources <- unique(foo$datasource)

      if ("profile_redir" %in% datasources) {
        main_source <- "profile_redir"
      } else if ("redir" %in% datasources) {
        main_source <- "redir"
      } else if ("spylog" %in% datasources) {
        main_source <- "spylog"
      } else {
        stop("Not known main datasource")
      }

      #HACK fix this
      foo <- foo[ datasource == main_source | grepl("morda", service) ]

      # A series is identified by these fields plus every data_defaults
      # dimension present in the file.
      key_fields <- c("datasource", "service", "stage", "gran")

      for (i in names(foo)) {
        if (i %in% names(data_defaults)) {
          key_fields <- c(key_fields, i)
        }
      }
      setkeyv(foo, key_fields)
      agg <- foo[, list(len = sum(len), N = .N), keyby = key_fields]

      #check counters
      # A series must not hold more rows than its granularity allows for one
      # day, and the granularity must be a known one.
      bar <- agg[(N > 24 & gran == "1hr") | (N > 144 & gran == "10m") | (N > 1440 & gran == "1m") | (N > 1 & gran == "1d") | !(gran %in% c("1m", "10m", "1hr", "1d"))]
      print(bar[1:100])
      stopifnot(nrow(bar) == 0)

      # Trash: total len below 28800 (20 per minute over 1440 minutes), or
      # too few rows for the series' granularity.
      agg[, trash := len < 28800 | (N < 20 & gran == "1hr") | (N < 120 & gran == "10m") | (N < 1200 & gran == "1m")]

      kept <- agg[, list(len = sum(as.integer64(len)), N = .N), by = "trash"]
      print(kept)

      agg$len <- NULL
      agg$N <- NULL
      # Keyed join attaches the trash flag to every row; the row count must
      # not change.
      before <- nrow(foo)
      foo <- foo[agg]
      after <- nrow(foo)
      stopifnot(before == after)
      foo <- foo[trash == FALSE, ]
      foo$trash <- NULL
    }

    # Append ":00" to the time text before converting it to a chron time.
    foo$time <- paste(foo$time, "00", sep = ":")
    foo$time <- times(foo$time)
    foo$date <- dates(foo$date, format = c(dates = "y-m-d"))

    print(paste0(name, ":", paste(names(foo), collapse = " ")))
    return(foo)
  })

  foo <- data_bind(files, tables)

  return(foo)
}
