library(nlme)
library(mgcv)
library(MuMIn)
library(multcomp)

#library(car)
#source("~/Documents/2015pCO2paper/UniversalRCode/multcompToGraph.R")
#source("~/Documents/2015pCO2paper/UniversalRCode/nlmeDiagPlots.R")
#source("~/Documents/2015pCO2paper/UniversalRCode/qqnormWithBounds.R")

# Reads in Japan data and reformats as needed
## the necessary data order is 
##'Expocode,longitude,latitude,yr,mon,day,hh,mm,sst,sss,fCO2rec,PPPP,Requ,Tequ,ETOPO2_depth'
read.in.data <- function(the.string) {
  raw.data <- read.csv(the.string)
  raw.data <- raw.data[,1:11]
  the.data <- na.omit(raw.data)
  #the.data <- the.data[the.data$fCO2rec<500,] # remove outliers per Hongjie
  the.data <- the.data[the.data$sss>30,] # remove outliers per Hongjie
  colnames(the.data)[1] <- "cruise"
  
  # c.date is character date
  # a.date is in Date format (days after Jan 1, 1970)
  # c.time is character time (24 hours)
  # p.date is Posix format time, seconds after midnight, Jan 1, 1970, zulu time
  #           not sure if original data in zulu time or the.data time
  # n.date is p.date converted to numeric format
  # m.date is numeric minutes after midnight, Jan 1, 1970, zulu time
  # doy is day of the year, with doy=1 on Jan 1
  the.data$c.date <- paste(the.data$yr, the.data$mon, the.data$day, sep="-")
  the.data$a.date <- as.Date(the.data$c.date,format="%Y-%m-%d")
  the.data$c.time <- paste(the.data$hh, the.data$mm, sep=":")
  the.data$p.date <- as.POSIXct(paste(the.data$c.date, the.data$c.time),format="%Y-%m-%d %H:%M")
  the.data$n.date <- as.numeric(the.data$p.date)
  the.data$m.date <- round(the.data$n.date/60)
  # Hah!  8 times are regarded as invalid because they occur during the shift forward
  # in daylight savings time.  For now, we'll remove them:
  the.data <- na.omit(the.data)
  the.data$doy <- 0
  for (a.yr in unique(the.data$yr)) {
    jan.1 <- as.Date(paste(a.yr,1,1,sep="-"), format="%Y-%m-%d")
    the.data$doy[the.data$yr==a.yr] <- the.data$a.date[the.data$yr==a.yr] - jan.1 + 1
  }
  the.data <- the.data[order(the.data$cruise, the.data$n.date),]
  the.data$f.mon <- factor(the.data$mon)
  the.data$yr.mon <- round(the.data$yr + the.data$mon/12 - 1/24,2)
  the.data$f.yr.mon <- factor(the.data$yr.mon)
  
  #  try subdividing cruises if gap > 180 minutes
  c.cruise <- as.character(the.data$cruise)
  for (u in unique(c.cruise)) {
    inc.ctr <- 1
    cruise.names <- rep(u, sum(c.cruise==u))
    dt <- diff(the.data$m.date[c.cruise==u])
    big.gaps <- (1:length(dt))[dt>180]
    if (length(big.gaps) > 0) {
      for (k in 1:length(big.gaps)) {
        inc.ctr <- inc.ctr+1
        cruise.names[(big.gaps[k]+1):length(cruise.names)] <- paste(u,inc.ctr,sep="_")
      }
    }
    c.cruise[c.cruise==u] <- cruise.names
  }
  
  #try subdividing cruises into subcruises of length 25 or less
  for (u in unique(c.cruise)) {
    inc.ctr <- 1
    l <- sum(c.cruise==u)
    cruise.names <- rep(u, l) 
    cruise.ctr <- 1
    while (l > 49) {
      inc.ctr <- inc.ctr+1
      cruise.names[cruise.ctr:length(cruise.names)] <- paste(u, inc.ctr, sep="s")
      cruise.ctr <- cruise.ctr + 25; l <- l-25
    }
    if (l > 25) {
      inc.ctr <- inc.ctr+1
      cruise.names[cruise.ctr:length(cruise.names)] <- paste(u, inc.ctr, sep="s")
      cruise.ctr <- cruise.ctr + round(l/2)    
      inc.ctr <- inc.ctr+1
      cruise.names[cruise.ctr:length(cruise.names)] <- paste(u, inc.ctr, sep="s")
    }
    c.cruise[c.cruise==u] <- cruise.names
  }
  the.data$cruise <- factor(c.cruise)
  the.data <- the.data[order(the.data$cruise, the.data$n.date),]
  # some datasets have multiple measurements in the same minute; average them
  unique.id <- factor(paste(the.data$cruise, the.data$m.date))
  the.data <- gsummary(the.data, groups=unique.id)
  
  return(the.data)
}

opt.fn <- function(params, y, dt) {
  phi <- exp(params[1]); sigma <- exp(params[2]); n <- length(y)
  diffs <- y[2:n]-(phi^dt)*y[1:(n-1)]
  #  print(diffs[1:10])
  #  print(c(phi, sigma))
  return(-sum(dnorm(diffs, 0, sigma, log=T)))
}

fit.one.cruise <- function(the.data, the.cruise) {
  y <- the.data$fCO2rec[the.data$cruise==the.cruise]
  t <- the.data$m.date[the.data$cruise==the.cruise]
  dt <- diff(t)
  bang <- optim(c(-0.01, 0), opt.fn, y=y-mean(y), dt=dt, lower=c(-Inf,-Inf), upper=c(-1e-6,6), method="L-BFGS-B")
  return(exp(bang$par))
}

examine.cruises <- function (the.data, t0.model) {
  n.cruises <- length(levels(the.data$cruise))
  cruise.summary <- matrix(0, n.cruises, 11)
  colnames(cruise.summary) <- c("mean.deviation.from.T0", "sd.dev", "n.obs", "total.time", "corExp.range", "cor.p.value", "dt", "rho", "exp.rho", "fit.rho", "fit.sigma")
  rownames(cruise.summary) <- levels(the.data$cruise)
  big.predict <- predict(t0.model$model, newdata=data.frame(times=the.data$yr.mon)) 
  big.predict <- big.predict - mean(the.data$fCO2rec)
  big.predict <- big.predict + t0.model$monthly.df$decycled.fco2[the.data$mon]
  for (i in 1:n.cruises) {
    cr <- levels(the.data$cruise)[i]
    #print(cr)
    x <- the.data$fCO2rec[the.data$cruise==cr]
    n <- length(x)
    delta.t <- diff(the.data$m.date[the.data$cruise==cr])
    devs <- x - big.predict[the.data$cruise==cr]
    cruise.summary[i,1] <- mean(devs)
    cruise.summary[i,3] <- sum(the.data$cruise==cr)
    cruise.summary[i,4] <- diff(range(the.data$m.date[the.data$cruise==cr]))
    if (cruise.summary[i,3] > 5) {
      slack <- gls(fCO2rec~1, data=the.data[the.data$cruise==cr,])
      whack <- gls(fCO2rec~1, corr=corExp(value=1000,form=~m.date), data=the.data[the.data$cruise==cr,], control=glsControl(returnObject=T, opt="optim"))
      cruise.summary[i,2] <- summary(whack)$sigma
      cruise.summary[i,5] <- exp(whack$modelStruct$corStruct[1])
      cruise.summary[i,6] <- anova(slack, whack)[2,9]
      if (diff(range(diff(the.data$m.date[the.data$cruise==cr])))==0) { # evenly spaced in time
        crack <- gls(fCO2rec~1, corr=corAR1(value=0.9,form=~1|cruise), data=the.data[the.data$cruise==cr,], control=glsControl(returnObject=T, opt="optim"))
        cruise.summary[i,7] <- diff(the.data$m.date[the.data$cruise==cr])[1]
        rhoish <- crack$modelStruct$corStruct[1]
        cruise.summary[i,8] <- exp(rhoish)/(1+exp(rhoish))*2-1
        cruise.summary[i,9] <- exp(-cruise.summary[i,7]/cruise.summary[i,5])
      }
      cruise.summary[i, 10:11] <- fit.one.cruise(the.data, cr)
    }
    if (25*trunc(i/25)==i) {print(i)}
  }
  return(cruise.summary)
}

gen.cruise.resids <- function(the.data) {
  resid.out <- numeric(nrow(the.data))
  for (l in levels(the.data$cruise)) {
    if (length(the.data$fCO2rec[the.data$cruise==l]) > 5) {
      ar.model <- ar(the.data$fCO2rec[the.data$cruise==l])
      resid.out[the.data$cruise==l] <- ar.model$resid # includes NA's
    }
  }
  return(resid.out)
}

# Fit a Takahashi "original" model on some subset of a dataset
## Step 1: Determine which 4 years have the densest data
bfy <- function(the.data) {
  y.m <- sort(unique(the.data$yr.mon))
  n.mos <- numeric(length(y.m))
  for (i in 1:length(y.m)) {
    n.mos[i] <- length(y.m[y.m >= y.m[i] & y.m < y.m[i]+4])
  }
  decent.choices <- y.m[n.mos>0.9*max(n.mos)]
  dist.from.med <- abs(decent.choices-median(y.m))
  possible.starts <- sort(decent.choices[dist.from.med==min(dist.from.med)])
  return(possible.starts[1]) # (in case there's a tie, return the earliest)
}
bfy.all3 <- function(the.data) {
  y.m <- sort(unique(the.data$yr.mon))
  n.mos <- numeric(length(y.m))
  for (i in 1:length(y.m)) {
    n.mos[i] <- length(y.m[y.m >= y.m[i] & y.m < y.m[i]+4])
  }
  decent.choices <- y.m[n.mos>0.9*max(n.mos)]
  return(decent.choices)
}
## Step 2: Find monthly means for the dataset.  To account for possible missing
## months at the start or end of the year, we extend the "year" in each direction.
fmm <- function(the.data, start.time) {
  the.data <- the.data[the.data$yr.mon >= start.time & the.data$yr.mon < (start.time+4),]
  the.data$f.mon <- factor(the.data$mon)
  monthly.means <- tapply(the.data$fCO2rec, the.data$f.mon, mean)
  months <- as.numeric(names(monthly.means))
  month.df <- data.frame(months=months, means=as.numeric(monthly.means))
  # There's a small3 chall3enge to interpolating if one of the missing months is 
  # December or January, so extend the means at each end
  month.df <- rbind(month.df[nrow(month.df),], month.df, month.df[1,])
  month.df$months[1] <- month.df$months[1] - 12
  month.df$months[nrow(month.df)] <- month.df$months[nrow(month.df)] + 12
  mm.out <- numeric(12)
  for (i in 1:12) {
    if (i %in% month.df$months) { # then we have that month in the data
      mm.out[i] <- month.df$means[month.df$months==i]
    }
    else { # we have to interpolate
      # find months on either side of the current month that we have data for
      j <- i-1; while (!(j %in% month.df$months)) {j <- j-1}
      k <- i+1; while (!(k %in% month.df$months)) {k <- k+1}
      mm.out[i] <- month.df$means[month.df$months==j] + 
        (i-j)/(k-j)*(month.df$means[month.df$months==k] - month.df$means[month.df$months==j])
    }
  }
  return(mm.out)
}
## Step 3:  Decycle data and reduce to monthly
decycle.data <- function(the.data, start.time, monthly.means) {
  short.data <- the.data[the.data$yr.mon >= start.time & the.data$yr.mon < (start.time+4),]
  four.yr.mean <- mean(short.data$fCO2rec)
  the.data$adjustment <- four.yr.mean-monthly.means[the.data$mon]
  the.data$fco2adj <- the.data$fCO2rec+the.data$adjustment
  decycled.fco2 <- tapply(the.data$fco2adj, the.data$f.yr.mon, mean)
  return(data.frame(decycled.fco2 = tapply(the.data$fco2adj, the.data$f.yr.mon, mean),
                    times = as.numeric(names(decycled.fco2))))
}
## Step 4:  Function collecting the above and fitting the model:
T0.model <- function(the.data, start.time=NA, monthly.means=NA) {
  if (is.na(start.time)) {start.time <- bfy(the.data)}
  if (is.na(monthly.means)) {monthly.means<- fmm(the.data, start.time)} 
  monthly.df <- decycle.data(the.data, start.time, monthly.means)
  return(list(monthly.df = monthly.df, 
              model = lm(decycled.fco2~times, data=monthly.df)))
}


# convenience function for fitting an annual spline
annual.spline <- function(the.data) {
  return(gam(fCO2rec~s(doy,bs="cp",k=12),
             knots=list(doy=seq(1,365,length=13)), 
             corr=corCompSymm(form=~1|c.date),data=the.data))
}


# convenience function for fitting a W4 model
W4.model <- function(the.data) {
  return(gamm(fCO2rec~s(doy,bs="cp",k=12) + sss*sst + I(sst^2) + I(sss^2) + a.date,
              knots=list(doy=seq(1,365,length=13)), 
              weights=varComb(varExp(form=~sss/30)),
              corr=corExp(form=~m.date|cruise), data=the.data,
              control=list(returnObject=TRUE, opt="optim")))
}


############ The final code of W4 method this paper used
poly.W4.model.ar1 <- function(the.data) {
  return(gamm(fCO2rec~s(doy,bs="cp",k=12) + sss*sst + I(sst^2) + a.date,
              knots=list(doy=seq(1,366,length=13)), 
              weights=varComb(varExp(form=~sss-30),varExp(form=~sst),varExp(form=~as.numeric(a.date)/13250)),
              random=list(cruise=~1),
              corr=corAR1(value=0.995, form=~m.date|cruise), data=the.data))
}
############ The W3 method this paper used without removing the heteroscedasticity 
poly.W4.model.hete <- function(the.data) {
  return(gamm(fCO2rec~s(doy,bs="cp",k=12) + sss*sst + I(sst^2) + a.date,
              knots=list(doy=seq(1,366,length=13)), 
              random=list(cruise=~1),
              corr=corAR1(value=0.995, form=~m.date|cruise), data=the.data))
}

plot.sss.sst <- function(poly.model) {
  sst.seq <- seq(5, 31, 0.1)
  sss.seq <- seq(30, 35.5, 0.025)
  z <- matrix(0, length(sst.seq), length(sss.seq))
  pt <- summary(poly.model$gam)$p.table
  for (i in 1:length(sst.seq)) {for (j in 1:length(sss.seq)) {z[i,j] <- pt[2,1]*sss.seq[j] + pt[3,1]*sst.seq[i] + pt[4,1]*sst.seq[i]^2 + pt[6,1]*sss.seq[j]*sst.seq[i]}}
  contour(sst.seq, sss.seq, z, xlab="SST", ylab="SSS")
  points(sss~sst, data=Japan, pch=16, cex=0.25, col="red")
}

correct.AIC <- function(gamm.model) {
  return(-2*logLik(gamm.model)+2*attr(logLik(gamm.model$gam), "df") + 2*attr(logLik(gamm.model$lme), "df"))
}


test=read.in.data("V3Japan_lat34long140.csv") ## reading the test grid  
T0.model.1<-T0.model(test)    ##T0 method
W4.model.1<-poly.W4.model.ar1(test)  ##W4 method
###writing into csv file with proper output
test.results.mx <- matrix(0, 1, 11) 
colnames(test.results.mx) <- c("min_long", "max_long","min_lati","max_lati",'p_W4','W4.trend','number.of.points','W4.error',
                               'BP SSS','BP SST','BP date')
test.results.mx[1,1] <- min(test[,2])  #min_longitude
test.results.mx[1,2] <- max(test[,2])  #max_longitude
test.results.mx[1,3] <- min(test[,3])  #min_latitude
test.results.mx[1,4] <- max(test[,3])  #max_latitude
test.results.mx[1,5] <- summary.gam(W4.model.1$gam)$p.table[5,4]  ### Pvalue for W4 method
test.results.mx[1,6] <- coef(W4.model.1$gam)[5]*365.25  ### trend derived by W4 method
test.results.mx[1,7] <- dim(test)[1]                 ### number of points in this grid
test.results.mx[1,8] <- summary.gam(W4.model.1$gam)$p.table[5,2]*365.25  ### W4 standard error
r<-resid(W4.model.1$lme,type="normalized")
test.results.mx[1,9] <- bptest(r~test[,10])$p.value  ### BP test Pvalue for SSS
test.results.mx[1,10] <- bptest(r~test[,9])$p.value   ### BP test Pvalue for SST
test.results.mx[1,11] <- bptest(r~test[,16])$p.value  ### BP test Pvalue for Sampling date
write.table(test.results.mx,'TestGridOutput.csv',sep=",", row.names=F)